def main(args=sys.argv):
    if len(args) == 1:
        print "Usage: %s [dump|rpcreceive|rpcsend]" % args[0]
        return 1

    if args[1] == "dump":
        if len(args) != 3:
            print "Usage: %s dump input_file" % args[0]
            return 1
        for d in datafile.DataFileReader(file_or_stdin(args[2]), io.DatumReader()):
            print repr(d)
    elif args[1] == "rpcreceive":
        usage_str = "Usage: %s rpcreceive uri protocol_file " % args[0]
        usage_str += "message_name (-data d | -file f)"
        if len(args) not in [5, 7]:
            print usage_str
            return 1
        uri, proto, msg = args[2:5]
        datum = None
        if len(args) > 5:
            if args[5] == "-file":
                reader = open(args[6], 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                datum = dfr.next()
            elif args[5] == "-data":
                print "JSON Decoder not yet implemented."
                return 1
            else:
                print usage_str
                return 1
        run_server(uri, proto, msg, datum)
    elif args[1] == "rpcsend":
        usage_str = "Usage: %s rpcsend uri protocol_file " % args[0]
        usage_str += "message_name (-data d | -file f)"
        if len(args) not in [5, 7]:
            print usage_str
            return 1
        uri, proto, msg = args[2:5]
        datum = None
        if len(args) > 5:
            if args[5] == "-file":
                reader = open(args[6], 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                datum = dfr.next()
            elif args[5] == "-data":
                print "JSON Decoder not yet implemented."
                return 1
            else:
                print usage_str
                return 1
        send_message(uri, proto, msg, datum)
    return 0
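# Note: main() above calls a file_or_stdin() helper that is not defined in
# this snippet. A minimal sketch of what it presumably does follows; the
# "-" convention and the binary mode are assumptions, not from the original.
import sys

def file_or_stdin(f):
    # Treat "-" as "read the Avro container from stdin".
    if f == "-":
        return sys.stdin
    return open(f, 'rb')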
def _read_avro(fs, path, offset, length):
    contents = ''
    fhandle = None
    try:
        fhandle = fs.open(path)
        try:
            fhandle.seek(offset)
            data_file_reader = datafile.DataFileReader(fhandle, io.DatumReader())

            contents_list = []
            read_start = fhandle.tell()
            # Iterate over the entire sought file.
            for datum in data_file_reader:
                read_length = fhandle.tell() - read_start
                if read_length > length and len(contents_list) > 0:
                    break
                else:
                    datum_str = str(datum) + "\n"
                    contents_list.append(datum_str)
            data_file_reader.close()
            contents = "".join(contents_list)
        except Exception:
            logging.warn("Could not read avro file at %s" % path, exc_info=True)
            raise PopupException(_("Failed to read Avro file."))
    finally:
        if fhandle:
            fhandle.close()
    return contents
def test_container(self):
    writer = open('data.avro', 'wb')
    datum_writer = io.DatumWriter()
    schema_object = schema.parse("""\
{
  "type": "record",
  "name": "StringPair",
  "doc": "A pair of strings.",
  "fields": [
    {"name": "left", "type": "string"},
    {"name": "right", "type": "string"}
  ]
}
""")
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    datum = {'left': 'L', 'right': 'R'}
    dfw.append(datum)
    dfw.close()

    reader = open('data.avro', 'rb')
    datum_reader = io.DatumReader()
    dfr = datafile.DataFileReader(reader, datum_reader)
    data = []
    for datum in dfr:
        data.append(datum)

    self.assertEquals(1, len(data))
    self.assertEquals(datum, data[0])
def test_interop(self):
    ran = False
    print()
    print('TEST INTEROP')
    print('============')
    print()
    for f in os.listdir(_INTEROP_DATA_DIR):
        ran = True
        base_ext = os.path.splitext(os.path.basename(f))[0].split('_', 1)
        if len(base_ext) < 2 or base_ext[1] in datafile.VALID_CODECS:
            print('READING %s' % f)
            print('')

            # read data in binary from file
            reader = open(os.path.join(_INTEROP_DATA_DIR, f), 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            i = 0
            for i, datum in enumerate(dfr, 1):
                assert datum is not None
            assert i > 0
        else:
            print('SKIPPING %s due to an unsupported codec' % f)
            print('')
    self.assertTrue(ran, "Didn't find any interop data files to test")
def test_context_manager(self):
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2, 6):
        print 'Skipping context manager tests on this Python version.'
        return

    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def testMetadata(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.SetMeta('test.string', 'foo')
            dfw.SetMeta('test.number', '1')
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            self.assertEqual(b'foo', dfr.GetMeta('test.string'))
            self.assertEqual(b'1', dfr.GetMeta('test.number'))
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
def data_access_dir_binary_avro():
    dir_location = request.args.get('datadir_avro')
    print dir_location
    dir_url = base_url + dir_location + '?user.name=hdfs&op=OPEN'
    r = requests.get(dir_url, stream=True)
    print r.status_code

    # Stream the response body to a local file; the 'with' block
    # closes the file, so no explicit close() is needed.
    with open('p.avro', 'wb') as fo:
        for chunk in r:
            fo.write(chunk)
    print "created"

    OUTFILE_NAME = 'p.avro'
    rec_reader = io.DatumReader()
    df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), rec_reader)

    # Read all records stored inside
    mydata = []
    for record in df_reader:
        mydata.append(record)
    de = pd.DataFrame(mydata)
    #r=requests.get(dir_url)
    return de.to_html()
def testInterop(self):
    with tempfile.NamedTemporaryFile() as temp_path:
        WriteDataFile(temp_path.name, INTEROP_DATUM, INTEROP_SCHEMA)

        # read data in binary from file
        datum_reader = io.DatumReader()
        with open(temp_path.name, 'rb') as reader:
            dfr = datafile.DataFileReader(reader, datum_reader)
            for datum in dfr:
                self.assertEqual(INTEROP_DATUM, datum)
def testInterop(self):
    with tempfile.NamedTemporaryFile() as temp_path:
        write_data_file(temp_path.name, INTEROP_DATUM, get_interop_schema())

        # read data in binary from file
        datum_reader = io.DatumReader()
        with open(temp_path.name, "rb") as reader:
            dfr = datafile.DataFileReader(reader, datum_reader)
            for datum in dfr:
                self.assertEqual(INTEROP_DATUM, datum)
def testAppend(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in codecs_to_validate:
            file_path = self.NewTempFile()

            logging.debug(
                'Performing append with codec %r in file %s for example #%d\n'
                'Writing datum: %r using writer schema:\n%s',
                codec, file_path, iexample, datum, writer_schema)

            logging.debug('Creating data file %r', file_path)
            with open(file_path, 'wb') as writer:
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(writer_schema)
                with datafile.DataFileWriter(
                        writer=writer,
                        datum_writer=datum_writer,
                        writer_schema=schema_object,
                        codec=codec,
                ) as dfw:
                    dfw.append(datum)

            logging.debug('Appending data to %r', file_path)
            for i in range(9):
                with open(file_path, 'ab+') as writer:
                    with datafile.DataFileWriter(writer, io.DatumWriter()) as dfw:
                        dfw.append(datum)

            logging.debug('Reading appended data from %r', file_path)
            with open(file_path, 'rb') as reader:
                datum_reader = io.DatumReader()
                with datafile.DataFileReader(reader, datum_reader) as dfr:
                    appended_data = list(dfr)

            logging.debug(
                'Appended data has %d items: %r',
                len(appended_data), appended_data)

            if ([datum] * 10) == appended_data:
                correct += 1
            else:
                logging.error(
                    'Appended data does not match:\n'
                    'Expect: %r\n'
                    'Actual: %r',
                    [datum] * 10, appended_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
def test_append(self):
    print ''
    print 'TEST APPEND'
    print '==========='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            if codec == 'snappy':
                try:
                    import snappy
                except ImportError:
                    print 'Snappy not present. Skipping.'
                    correct += 1
                    continue
            print ''
            print 'SCHEMA NUMBER %d' % (i + 1)
            print '================'
            print ''
            print 'Schema: %s' % example_schema
            print 'Datum: %s' % datum
            print 'Codec: %s' % codec

            # write data in binary to file once
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
            dfw.append(datum)
            dfw.close()

            # open file, write, and close nine times
            for i in range(9):
                writer = open(FILENAME, 'ab+')
                dfw = datafile.DataFileWriter(writer, io.DatumWriter())
                dfw.append(datum)
                dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            appended_data = []
            for datum in dfr:
                appended_data.append(datum)

            print 'Appended Data: %s' % appended_data
            print 'Appended Data Length: %d' % len(appended_data)
            is_correct = [datum] * 10 == appended_data
            if is_correct:
                correct += 1
            print 'Correct Appended: %s' % is_correct
            print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def read_avro_file(insource='results.avro'):
    rec_reader = io.DatumReader()
    if insource == sys.stdin:
        input = sys.stdin.read()
        temp_file = StringIO(input)
        df_reader = datafile.DataFileReader(temp_file, rec_reader)
    else:
        # Avro containers are binary; open in 'rb' mode.
        df_reader = datafile.DataFileReader(open(insource, 'rb'), rec_reader)
    del stored[:]
    # Disabled: unpack positional "argN" fields from each record into
    # the module-level `stored` list.
    # for record in df_reader:
    #     size = record['size']
    #     for i in range(size):
    #         i = i + 1
    #         arg = record["arg%s" % (i,)]
    #         #print arg
    #         stored.append(arg)
    return df_reader
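# Usage sketch (assumed call patterns, not from the original): the function
# clears a module-level `stored` list and returns an open DataFileReader,
# so the caller is responsible for closing it.
import sys

reader = read_avro_file('results.avro')   # read from a file on disk
# reader = read_avro_file(sys.stdin)      # read Avro data piped on stdin
for record in reader:
    print(record)
reader.close()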
def testInterop(self):
    datum_reader = io.DatumReader()
    for avro_file in glob.glob('../../build/interop/data/*.avro'):
        base_ext = os.path.splitext(os.path.basename(avro_file))[0].split('_', 1)
        if len(base_ext) < 2 or base_ext[1] in datafile.VALID_CODECS:
            with open(avro_file, 'rb') as reader, \
                    datafile.DataFileReader(reader, datum_reader) as dfr:
                i = 0
                for i, datum in enumerate(dfr, 1):
                    self.assertIsNotNone(datum)
                self.assertGreater(i, 0)
def main():
    # Create a datum reader.
    rec_reader = io.DatumReader()

    # Define files to convert into parquet files
    files = ['logs_0.avro', 'logs_1.avro', 'logs_2.avro', 'logs_3.avro']
    pqfiles = []
    #files = ['logs_small.avro']

    # Loop to process the files
    for f in files:
        # Print message
        print("Converting", f, "to parquet format..")
        # Define reader to avro format.
        df_reader = datafile.DataFileReader(open(f, "rb"), rec_reader)
        # Convert the records from avro into pandas dataframe.
        df = pd.DataFrame.from_records(df_reader)
        # Convert pandas dataframe into parquet table
        table = pa.Table.from_pandas(df)
        # Set the parquet file name and append it to the list.
        newfile = str(f).replace('.avro', '.parquet')
        pqfiles.append(newfile)
        # Write the data into a parquet file format.
        pq.write_table(table, newfile)
        # Close the dataframe reader
        df_reader.close()

    # S3 setup
    s3 = boto3.resource('s3',
                        aws_access_key_id=ACCESS_KEY_ID,
                        aws_secret_access_key=ACCESS_SECRET_KEY,
                        config=Config(signature_version='s3v4'))

    # Loop to save the files on S3.
    for f in pqfiles:
        # File to upload on S3
        upfile = open(f, 'rb')
        # Upload the file on S3.
        print("Uploading", f, "on S3 AWS..")
        s3.Bucket(BUCKET_NAME).put_object(Key=f, Body=upfile)
        # Close the file.
        upfile.close()
def testRoundTrip(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in codecs_to_validate:
            file_path = self.NewTempFile()

            # Write the datum this many times in the data file:
            nitems = 10

            logging.debug(
                'Performing round-trip with codec %r in file %s for example #%d\n'
                'Writing datum: %r using writer schema:\n%s',
                codec, file_path, iexample, datum, writer_schema)

            logging.debug('Creating data file %r', file_path)
            with open(file_path, 'wb') as writer:
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(writer_schema)
                with datafile.DataFileWriter(
                        writer=writer,
                        datum_writer=datum_writer,
                        writer_schema=schema_object,
                        codec=codec,
                ) as dfw:
                    for _ in range(nitems):
                        dfw.append(datum)

            logging.debug('Reading data from %r', file_path)
            with open(file_path, 'rb') as reader:
                datum_reader = io.DatumReader()
                with datafile.DataFileReader(reader, datum_reader) as dfr:
                    round_trip_data = list(dfr)

            logging.debug(
                'Round-trip data has %d items: %r',
                len(round_trip_data), round_trip_data)

            if ([datum] * nitems) == round_trip_data:
                correct += 1
            else:
                logging.error(
                    'Round-trip data does not match:\n'
                    'Expect: %r\n'
                    'Actual: %r',
                    [datum] * nitems, round_trip_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
def test_empty_datafile(self):
    """A reader should not fail to read a file consisting of a single empty block."""
    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
                                 sample_schema) as dfw:
        dfw.flush()
        # Write an empty block: zero object count, zero byte count,
        # then the sync marker.
        dfw.encoder.write_long(0)
        dfw.encoder.write_long(0)
        dfw.writer.write(dfw.sync_marker)

    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
        self.assertEqual([], list(dfr))
def read_avro_file(name):
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader; Avro containers are
    # binary, so open in 'rb' mode.
    df_reader = datafile.DataFileReader(open(name, 'rb'), rec_reader)

    # Read all records stored inside
    for record in df_reader:
        with open(record['filename'], 'wb') as f:
            f.write(record['content'])
def read_avro_file():
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader; Avro containers are
    # binary, so open in 'rb' mode.
    df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), rec_reader)

    # Read all records stored inside
    for record in df_reader:
        print record['name'], record['age']
        print record['address'], record['value']
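# For context, a hypothetical writer counterpart that would produce the
# name/age/address/value records read back above. The schema and the
# OUTFILE_NAME value are assumptions inferred from the fields printed,
# not taken from the original.
from avro import datafile, io, schema

OUTFILE_NAME = 'users.avro'  # assumed

USER_SCHEMA = schema.parse("""
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "name",    "type": "string"},
    {"name": "age",     "type": "int"},
    {"name": "address", "type": "string"},
    {"name": "value",   "type": "double"}
  ]
}
""")

def write_avro_file():
    rec_writer = io.DatumWriter()
    # Positional arguments sidestep the writers_schema/writer_schema
    # keyword rename between the Python 2 and Python 3 avro packages.
    df_writer = datafile.DataFileWriter(open(OUTFILE_NAME, 'wb'),
                                        rec_writer, USER_SCHEMA)
    df_writer.append({'name': 'Ada', 'age': 36,
                      'address': '1 Example St', 'value': 42.0})
    df_writer.close()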
def test_interop(self):
    print ''
    print 'TEST INTEROP'
    print '============'
    print ''
    for f in os.listdir('/home/blue/avro/lang/py/../../build/interop/data'):
        print 'READING %s' % f
        print ''

        # read data in binary from file
        reader = open(os.path.join('/home/blue/avro/lang/py/../../build/interop/data', f), 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        for datum in dfr:
            assert datum is not None
def test_interop(self):
    print('')
    print('TEST INTEROP')
    print('============')
    print('')
    for f in os.listdir('@INTEROP_DATA_DIR@'):
        print('READING %s' % f)
        print('')

        # read data in binary from file
        reader = open(os.path.join('@INTEROP_DATA_DIR@', f), 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        for datum in dfr:
            assert datum is not None
def test_interop(self):
    print ''
    print 'TEST INTEROP'
    print '============'
    print ''
    for f in os.listdir(INTEROP_DATA_DIR):
        print 'READING %s' % f
        print ''

        # read data in binary from file
        reader = open(os.path.join(INTEROP_DATA_DIR, f), 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        for datum in dfr:
            assert datum is not None
def test_round_trip(self):
    print ''
    print 'TEST ROUND TRIP'
    print '==============='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            if codec == 'snappy':
                try:
                    import snappy
                except ImportError:
                    print 'Snappy not present. Skipping.'
                    correct += 1
                    continue
            print ''
            print 'SCHEMA NUMBER %d' % (i + 1)
            print '================'
            print ''
            print 'Schema: %s' % example_schema
            print 'Datum: %s' % datum
            print 'Codec: %s' % codec

            # write data in binary to file 10 times
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
            for i in range(10):
                dfw.append(datum)
            dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            round_trip_data = []
            for datum in dfr:
                round_trip_data.append(datum)

            print 'Round Trip Data: %s' % round_trip_data
            print 'Round Trip Data Length: %d' % len(round_trip_data)
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print 'Correct Round Trip: %s' % is_correct
            print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def process_file(afile, output_path):
    rec_reader = io.DatumReader()
    # Avro containers are binary; open in 'rb' mode.
    df_reader = datafile.DataFileReader(open(afile, 'rb'), rec_reader)
    for record in df_reader:
        basename = os.path.basename(record['file_path'])
        dirname = os.path.dirname(record['file_path'])
        x = output_path + dirname
        x = x.replace('//', '/')
        c = ['mkdir', '-p', x]
        subprocess.call(c)
        fpath = x + '/' + basename
        print fpath
        # Write the decoded bytes in binary mode; the 'with' block
        # closes the file, so no explicit close() is needed.
        with open(fpath, 'wb') as f:
            b64 = base64.b64decode(record['content'])
            f.write(b64)
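# Hypothetical packer counterpart to process_file() above: stores files in
# an Avro container with base64-encoded content, matching the 'file_path'
# and 'content' fields the reader unpacks. The schema is an assumption,
# not taken from the original.
import base64
from avro import datafile, io, schema

PACK_SCHEMA = schema.parse("""
{
  "type": "record",
  "name": "PackedFile",
  "fields": [
    {"name": "file_path", "type": "string"},
    {"name": "content",   "type": "string"}
  ]
}
""")

def pack_files(paths, outfile):
    df_writer = datafile.DataFileWriter(open(outfile, 'wb'),
                                        io.DatumWriter(), PACK_SCHEMA)
    for path in paths:
        with open(path, 'rb') as f:
            df_writer.append({
                'file_path': path,
                # decode() keeps the value a valid Avro "string" on
                # both Python 2 and Python 3.
                'content': base64.b64encode(f.read()).decode('ascii'),
            })
    df_writer.close()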
def test_interop(self):
    print ''
    print 'TEST INTEROP'
    print '============'
    print ''
    for f in os.listdir('@INTEROP_DATA_DIR@'):
        print 'READING %s' % f
        print ''

        # read data in binary from file
        reader = open(os.path.join('@INTEROP_DATA_DIR@', f), 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        i = 0
        for i, datum in enumerate(dfr, 1):
            assert datum is not None
        assert i > 0
def cat(opts, args):
    if not args:
        raise AvroError('No files to show')

    for filename in args:
        try:
            fo = open(filename, 'rb')
        except (OSError, IOError) as e:
            raise AvroError('Cannot open %s - %s' % (filename, e))
        avro = datafile.DataFileReader(fo, avro_io.DatumReader())

        if opts.print_schema:
            print_schema(avro)
            continue

        print_avro(avro, opts)
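# Hypothetical helpers assumed by cat() above (not shown in the original).
# print_schema reads the writer schema from the standard 'avro.schema'
# container metadata key; print_avro dumps records as JSON. get_meta() is
# the Python 2 avro DataFileReader metadata accessor; other versions spell
# it differently.
import json

def print_schema(avro):
    print(json.dumps(json.loads(avro.get_meta('avro.schema')), indent=4))

def print_avro(avro, opts):
    # opts is accepted for interface parity; the real tool presumably
    # honors options such as record limits, omitted in this sketch.
    for record in avro:
        # Assumes records are JSON-serializable; bytes/fixed fields
        # would need extra handling.
        print(json.dumps(record))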
def test_round_trip(self):
    print('')
    print('TEST ROUND TRIP')
    print('===============')
    print('')
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            print('')
            print('SCHEMA NUMBER %d' % (i + 1))
            print('================')
            print('')
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)
            print('Codec: %s' % codec)

            # write data in binary to file 10 times
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
            for i in range(10):
                dfw.append(datum)
            dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            round_trip_data = []
            for datum in dfr:
                round_trip_data.append(datum)

            print('Round Trip Data: %s' % round_trip_data)
            print('Round Trip Data Length: %d' % len(round_trip_data))
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print('Correct Round Trip: %s' % is_correct)
            print('')
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def test_context_manager(self):
    """Test the writer with a 'with' statement."""
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def clean(data):
    try:
        json.dumps(data)
        return data
    except Exception:
        LOG.exception('Failed to dump data as JSON')

    cleaned = {}
    lim = [0]
    if isinstance(data, str):
        # Not JSON dumpable, meaning some sort of bytestring or byte data.
        # Detect if avro file: Avro containers start with the magic bytes
        # 'Obj' (0x4F 0x62 0x6A).
        if data[:3] == '\x4F\x62\x6A':
            # write data to file in memory
            output = StringIO.StringIO()
            output.write(data)
            # read and parse avro
            rec_reader = io.DatumReader()
            df_reader = datafile.DataFileReader(output, rec_reader)
            return json.dumps(clean([record for record in df_reader]))
        return base64.b64encode(data)

    if hasattr(data, "__iter__"):
        if type(data) is dict:
            for i in data:
                cleaned[i] = clean(data[i])
        elif type(data) is list:
            cleaned = []
            for i, item in enumerate(data):
                cleaned += [clean(item)]
        else:
            for i, item in enumerate(data):
                cleaned[i] = clean(item)
    else:
        for key in dir(data):
            value = getattr(data, key)
            if value is not None and not hasattr(value, '__call__') and sum([
                    int(bool(re.search(ignore, key)))
                    for ignore in ignored_fields]) == 0:
                cleaned[key] = clean(value)
    return cleaned
def testContextManager(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
def read_avro_file():
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader; Avro containers are
    # binary, so open in 'rb' mode.
    df_reader = datafile.DataFileReader(open(INFILE_NAME, 'rb'), rec_reader)

    # Read all records stored inside; start the counter at 0 so the
    # final total matches the number of records read.
    n = 0
    for record in df_reader:
        # print record
        n = n + 1
        # print record['name'], record['age']
        # print record['address'], record['value']
        # Do whatever read-processing you wanna do
        # for each record here ...
    print "No. of Records in file :- {0}".format(str(n))