Example #1
  def read(self, format):
    time_start = time.time()

    if format == 'json':
      with open('./output/output.json') as file:
        json.loads(file.read())

    elif format == 'jsch':
      with open('./output/output.json') as file:
        validate(json.loads(file.read()), self._schema_json)

    elif format == 'avro':
      reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
      for user in reader:
        pass
      reader.close()

    elif format == 'protobuf':
      with open('./output/output.pb', 'rb') as file:
        addressbook_pb2.AddressBook().ParseFromString(file.read())

    elif format == 'gzjson':
      with gzip.open('./output/output.jsz', 'rb') as file:
        json.loads(file.read())

    time_end = time.time()

    return time_end - time_start
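The method above only returns the elapsed wall-clock time; as a self-contained illustration of the same timing pattern, here is a minimal sketch covering the JSON and gzip-JSON cases (the file path is illustrative, not the original benchmark harness):

import gzip
import json
import time

def time_json_read(path, gzipped=False):
    # Return the wall-clock seconds needed to load and parse one JSON file.
    start = time.time()
    opener = gzip.open if gzipped else open
    with opener(path, 'rb') as f:
        json.loads(f.read())
    return time.time() - start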
Example #2
def deserializeDataFromFile2Str(inputFile):
	logging.debug("Deserializing file:"+inputFile)
	reader = DataFileReader(open(inputFile, "r"), DatumReader())
	data=""
	for item in reader:
		data=data+str(item)
	reader.close()
	return data
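Concatenating records into one string loses their structure; a minimal Python 3 sketch that instead collects each datum as a dict (Avro container files must be opened in binary mode on Python 3):

from avro.datafile import DataFileReader
from avro.io import DatumReader

def deserializeDataFromFile2Records(inputFile):
    # Collect every datum from the Avro container file into a list of dicts.
    with open(inputFile, "rb") as f:
        reader = DataFileReader(f, DatumReader())
        records = list(reader)
        reader.close()
    return records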
Example #3
    def generic_dataframe(self, df, avro_schema, assert_fns=None):
        """Generic test running function for arbitrary avro schemas.

        Writes a dataframe containing the records to avro.

        Reads back and compares with the original
        """
        print(avro_schema)

        cyavro.write_avro_file_from_dataframe(df, self.filename,
                                              json.dumps(avro_schema),
                                              codec='null'
                                              )

        if assert_fns is None:
            assert_fns = {}

        df_read = cyavro.read_avro_file_as_dataframe(self.filename)

        import avro.schema
        from avro.datafile import DataFileReader, DataFileWriter
        from avro.io import DatumReader, DatumWriter

        with open(self.filename, 'rb') as fo:
            reader = DataFileReader(fo, DatumReader())
            records = []
            for user in reader:
                records.append(user)
            df_reference = pd.DataFrame(records)
            reader.close()

        success = True

        for col in avro_schema["fields"]:
            colname = col['name']
            assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

            def print_fail_header(s):
                print('#' * len(s))
                print("FAIL: Column {}".format(col))
                print('#' * len(s))
                print(s)

            try:
                assert_fn(df_read[colname], df[colname])
            except AssertionError:
                print_fail_header("Failed for cyavro read comparison  {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

            try:
                assert_fn(df_reference[colname], df[colname])
            except AssertionError:
                print_fail_header("Failed for cyavro write comparison {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

        assert success
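A minimal sketch of the kind of input this helper expects; the schema, column names, and values below are illustrative, not taken from the original test suite:

import pandas as pd

# Inside a test method of the same class as generic_dataframe:
schema = {
    "type": "record",
    "name": "Example",
    "fields": [
        {"name": "id", "type": "long"},
        {"name": "label", "type": "string"},
    ],
}
df = pd.DataFrame({"id": [1, 2, 3], "label": ["a", "b", "c"]})
self.generic_dataframe(df, schema)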
Example #4
File: example1.py  Project: 3rwww1/Hadoop
def testRead(filename):
    fd = open(filename, 'rb')
    datum_reader = DatumReader()
    freader = DataFileReader(fd, datum_reader)
    for datum in freader:
        print datum['name'], datum['company']
        print datum['website']
        print
    freader.close()
Example #5
File: test-file.py  Project: 3rwww1/Hadoop
def testRead(filename):
    fd = open(filename, 'rb')

    datum = DatumReader()
    reader = DataFileReader(fd, datum)

    for record in reader:
        print record['name'], record['age']

    reader.close()
Example #6
def main():

    if len(sys.argv) < 3:
        print "Usage:", sys.argv[0]
        print "add [num of events to add] filename"
        print "list filename"
        exit(1)

    command = sys.argv[1]

    if command == 'add':

        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events

        existingEvents = {}

        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            existingEvents = reader
            reader.close()
        except IOError:
            print filename + ": Could not open file.  Creating a new one."

        # Write back out to disk

        try:

            schema = avro.schema.parse(open("etc/userevent.avsc").read())

            f = open(filename, "w")
            writer = DataFileWriter(f, DatumWriter(), schema)

            # Append new user events

            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print newEvent
                writer.append(newEvent)

            writer.close()

            print "Wrote {0} user events".format(noEvents)
        except IOError:
            print filename + ": Could not save file."

    elif command == 'list':

        listAllUserEvents(sys.argv[2])

    else:
        print "Unregistered command. Exiting"
        sys.exit(1)
Example #7
def loadOldData(filename):
    oldDataDict = dict()

    if not os.path.isfile(filename):
        return oldDataDict

    reader = DataFileReader(open(filename, "r"), DatumReader())
    for weight in reader:
        oldDataDict[weight["site"]] = weight["weight"]
    reader.close()

    return oldDataDict
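The matching write path is not shown above; a hedged sketch of saving the same site-to-weight mapping, assuming a schema file that defines a record with a string "site" field and a double "weight" field (the schema path and field types are assumptions):

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

def saveNewData(filename, dataDict, schema_path="weights.avsc"):
    # Assumed schema: a record with a string "site" field and a double "weight" field.
    with open(schema_path) as sf:
        schema = avro.schema.parse(sf.read())
    with open(filename, "wb") as f:
        writer = DataFileWriter(f, DatumWriter(), schema)
        for site, weight in dataDict.items():
            writer.append({"site": site, "weight": weight})
        writer.close()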
Example #8
def main():
	try:
		opts, args = getopt.getopt(sys.argv[1:], "hi:s:", ["help", "input-file=",
						"schema="])
	except getopt.GetoptError as err:
		# print help information and exit:
		print str(err) # will print something like "option -a not recognized"
		usage(sys.argv[0])
		sys.exit(2)

	avro_file = None
	avro_schema_file = None

	required_cl = 0

	for o, a in opts:
		if o in ("-h", "--help"):
			usage(sys.argv[0])
			sys.exit()
		elif o in ("-i", "--input-file"):
			required_cl += 1
			avro_file = a
		elif o in ("-s", "--schema"):
			avro_schema_file = a
		else:
			assert False, "unhandled option"

	if (required_cl < 1): 
		print "ERROR: Missing required argument"
		usage(sys.argv[0])
		sys.exit(1)

	if not avro_schema_file:
		reader = DataFileReader(open(avro_file, "r"), DatumReader())
		for datum in reader:
			print datum
		reader.close()
	else:
		reader_schema = open(avro_schema_file, "r")
		avro_schema = reader_schema.read()
		reader_schema.close()
		parsed_avro_schema = avro.schema.parse(avro_schema)

		with open(avro_file, "rb") as reader_data:
			inputio = io.BytesIO(reader_data.read())
			decoder = avro.io.BinaryDecoder(inputio)
			reader = avro.io.DatumReader(parsed_avro_schema)
			while inputio.tell() < len(inputio.getvalue()):
				avro_datum = reader.read(decoder)
				print avro_datum
Example #9
def listAllUserEvents(filename):

    try:

        reader = DataFileReader(open(filename, "r"), DatumReader())
        for event in reader:

            # Query uuids of events
            print "event id: {0}, event data extra fields: {1}".format(event["uuid"], event["eventData"]["otherEventData"])

        reader.close()
    except IOError:
        print filename + ": Could not open file.  Exiting"
        sys.exit(1)
Example #10
    def handle(self):
        data = self.request.recv(8024).strip()
        data = StringIO(data)

        reader = DataFileReader(data, DatumReader())
        for fileData in reader:
            id = fileData['id']
            data = fileData['data']

            print fileData

            if not fileDict.has_key(id):
                fileDict[id] = open("./" + id, "w")

            f = fileDict[id]

            f.write(data)
            f.flush()
        reader.close()
Example #11
File: avro_test.py  Project: ivernaloo/avro
def main():
  """Start of execution"""
  #combine the schemas 
  known_schemas = avro.schema.Names()
  types_schema = LoadAvsc("parameter_types.avsc", known_schemas)
  param_schema = LoadAvsc("parameter.avsc", known_schemas)
  print json.dumps(param_schema.to_json(avro.schema.Names()), indent=2) 
  #test the schema works 
  param_file = open("parameters.avro", "w")
  writer = DataFileWriter(param_file, DatumWriter(), param_schema)
  param_1 = {"name": "test", "description":"An Avro test.", "type":"int"}
  param_2 = {"name": "test", "description":"An Avro test.", "type":"boolean"}
  writer.append(param_1)
  writer.append(param_2)
  writer.close()
  reader = DataFileReader(open("parameters.avro", "r"), DatumReader())
  for parameter in reader:
      print parameter
  reader.close()  
Example #12
def readAndWriteAvro():
    """ Unlike java, avro does not let you generate
        code for Tweet in python. So only way to read and write
        data is without using code generation"""

    #Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())


    #write some data
    writer = DataFileWriter(open("tweets.avro", "w"), DatumWriter(), schema)
    writer.append({"tweetId": 5, "user": "******", "text" : "Tweeting from python as well"})
    writer.close()

    #read the same data
    tweets = DataFileReader(open("tweets.avro", "r"), DatumReader())
    for tweet in tweets:
        print tweet
    tweets.close()
Example #13
File: pyavro.py  Project: yuyiguo/WMArchive
def read(fin, fout=None, nrecords=0):
    "Read given avro file according to its schema and dump on stdout its content"
    reader = DataFileReader(open(fin, "r"), DatumReader())
    fobj = open(fout, 'w') if fout else None
    count = 0
    if  fobj:
        fobj.write("[\n")
    for rec in reader:
        if  fobj:
            if  count:
                fobj.write(",\n")
            fobj.write(json.dumps(rec))
        else:
            pprint.pprint(rec)
        if  nrecords and count >= nrecords:
            break
        count += 1
    if  fobj:
        fobj.write("]\n")
        fobj.close()
    reader.close()
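A hedged usage sketch for the helper above (the file names are illustrative): dump the first ten records to a JSON array, or pretty-print every record to stdout:

read("records.avro", fout="records.json", nrecords=10)
read("records.avro")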
Example #14
def test_data_format_avro(sdc_builder, sdc_executor, couchbase):
    bucket_name = get_random_string(string.ascii_letters, 10)
    document_key = 'id'

    DATA = {
        'name': 'boss',
        'age': 60,
        'emails': ['*****@*****.**', '*****@*****.**'],
        'boss': None
    }
    SCHEMA = {
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Employee',
        'fields': [{
            'name': 'name',
            'type': 'string'
        }, {
            'name': 'age',
            'type': 'int'
        }, {
            'name': 'emails',
            'type': {
                'type': 'array',
                'items': 'string'
            }
        }, {
            'name': 'boss',
            'type': ['Employee', 'null']
        }]
    }
    cluster = couchbase.cluster

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    source = builder.add_stage('Dev Raw Data Source').set_attributes(
        data_format='JSON',
        raw_data=json.dumps(DATA),
        stop_after_first_batch=True)

    destination = builder.add_stage('Couchbase', type='destination')
    destination.set_attributes(authentication_mode='USER',
                               bucket=bucket_name,
                               document_key=document_key,
                               data_format='AVRO',
                               avro_schema=json.dumps(SCHEMA),
                               avro_schema_location='INLINE')

    source >> destination

    pipeline = builder.build().configure_for_environment(couchbase)
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating %s Couchbase bucket ...', bucket_name)
        couchbase.bucket_manager.create_bucket(
            CreateBucketSettings(name=bucket_name,
                                 bucket_type='couchbase',
                                 ram_quota_mb=256))
        couchbase.wait_for_healthy_bucket(bucket_name)

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        bucket = cluster.bucket(bucket_name)
        doc_value = bucket.get(document_key).value

        # decode the bytes object returned by Couchbase
        file = BytesIO(doc_value)
        reader = DataFileReader(file, DatumReader())
        records = [record for record in reader]
        assert len(
            records
        ) == 1, 'Number of records stored should equal number of records that entered the pipeline'
        assert records[0] == DATA
        reader.close()
    finally:
        if pipeline and sdc_executor.get_pipeline_status(
                pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        try:
            logger.info('Deleting %s Couchbase bucket ...', bucket_name)
            couchbase.bucket_manager.drop_bucket(bucket_name)
        except Exception as e:
            logger.error(f"Can't delete bucket: {e}")
Example #15
def readFile():
    reader = DataFileReader(open("part-00000.avro", "r"), DatumReader())
    for user in reader:
        print user
    reader.close()
Example #16
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

with open("blog.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

with open("blog.avro", "wb") as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema)
    writer.append({
        "title": "Avro is awesome",
        "content": "Let's learn Avro!",
        "is_published": False })
    writer.close()

with open("blog.avro") as in_file:
    reader = DataFileReader(in_file, DatumReader())
    for blog in reader:
        print blog
    reader.close()
Example #17
  def test1(self):
    """
    Run a tethered map-reduce job.

    Assumptions: 1) bash is available in /bin/bash
    """
    from word_count_task import WordCountTask
    from avro.tether import tether_task_runner
    from avro.datafile import DataFileReader
    from avro.io import DatumReader
    import avro

    import subprocess
    import StringIO
    import shutil
    import tempfile
    import inspect

    proc=None

    try:


      # TODO we use the tempfile module to generate random names
      # for the files
      base_dir = "/tmp/test_tether_word_count"
      if os.path.exists(base_dir):
        shutil.rmtree(base_dir)

      inpath = os.path.join(base_dir, "in")
      infile=os.path.join(inpath, "lines.avro")
      lines=["the quick brown fox jumps over the lazy dog",
             "the cow jumps over the moon",
             "the rain in spain falls mainly on the plains"]

      self._write_lines(lines,infile)

      true_counts=self._count_words(lines)

      if not(os.path.exists(infile)):
        self.fail("Missing the input file {0}".format(infile))


      # The schema for the output of the mapper and reducer
      oschema="""
{"type":"record",
 "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
     {"name":"key","type":"string"},
     {"name":"value","type":"long","order":"ignore"}
 ]
}
"""

      # write the schema to a temporary file
      osfile=tempfile.NamedTemporaryFile(mode='w',suffix=".avsc",prefix="wordcount",delete=False)
      outschema=osfile.name
      osfile.write(oschema)
      osfile.close()

      if not(os.path.exists(outschema)):
        self.fail("Missing the schema file")

      outpath = os.path.join(base_dir, "out")

      args=[]

      args.append("java")
      args.append("-jar")
      args.append(os.path.abspath("@TOPDIR@/../java/tools/target/avro-tools-@AVRO_VERSION@.jar"))


      args.append("tether")
      args.extend(["--in",inpath])
      args.extend(["--out",outpath])
      args.extend(["--outschema",outschema])
      args.extend(["--protocol","http"])

      # form the arguments for the subprocess
      subargs=[]

      srcfile=inspect.getsourcefile(tether_task_runner)

      # Create a shell script to act as the program we want to execute
      # We do this so we can set the python path appropriately
      script="""#!/bin/bash
export PYTHONPATH={0}
python -m avro.tether.tether_task_runner word_count_task.WordCountTask
"""
      # We need to make sure avro is on the path
      # getsourcefile(avro) returns .../avro/__init__.py
      asrc=inspect.getsourcefile(avro)
      apath=asrc.rsplit(os.sep,2)[0]

      # path to where the tests lie
      tpath=os.path.split(__file__)[0]

      exhf=tempfile.NamedTemporaryFile(mode='w',prefix="exec_word_count_",delete=False)
      exfile=exhf.name
      exhf.write(script.format((os.pathsep).join([apath,tpath]),srcfile))
      exhf.close()

      # make it world executable
      os.chmod(exfile,0755)

      args.extend(["--program",exfile])

      print "Command:\n\t{0}".format(" ".join(args))
      proc=subprocess.Popen(args)


      proc.wait()

      # read the output
      with file(os.path.join(outpath,"part-00000.avro")) as hf:
        reader=DataFileReader(hf, DatumReader())
        for record in reader:
          self.assertEqual(record["value"],true_counts[record["key"]])

        reader.close()

    except Exception as e:
      raise
    finally:
      # close the process
      if proc is not None and proc.returncode is None:
        proc.kill()
      if os.path.exists(base_dir):
        shutil.rmtree(base_dir)
      if os.path.exists(exfile):
        os.remove(exfile)
Example #18
def main():
   known_schemas = avro.schema.Names()

   with open("point.avsc", "rb") as fp:
      point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("review.avsc", "rb") as fp:
      review = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("place.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), point)
   writer.append({'x': 1.5, 'y': 2.75})
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['x'] == 1.5
   assert deserialized['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5})
      assert False
   except AvroTypeException as e:
      pass

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5, 'y': "wtanaka.com"})
      assert False
   except AvroTypeException as e:
      pass

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75}
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75},
         'review': {'rating': 4, 'text': '4 stars would come again'},
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), place)
      writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
            })
      assert False
   except AvroTypeException as e:
      pass
Example #19
def main():
    parser = optparse.OptionParser(description="""Filters consumer messages based on various criteria
                                                    (allowed NGIs, service flavours, metrics...)""")
    parser.add_option('-g', dest='gloconf', nargs=1, metavar='global.conf', help='path to global configuration file', type=str)
    group = optparse.OptionGroup(parser, 'Compute Engine usage')
    group.add_option('-d', dest='date', nargs=1, metavar='YEAR-MONTH-DAY')
    parser.add_option_group(group)
    group = optparse.OptionGroup(parser, 'Debugging usage')
    group.add_option('-f', dest='cfile', nargs=1, metavar='consumer_log_YEAR-MONTH-DAY.avro')
    parser.add_option_group(group)
    (options, args) = parser.parse_args()

    global logger
    logger = Logger(os.path.basename(sys.argv[0]))

    prefilter = {'Prefilter': ['ConsumerFilePath', 'PoemExpandedProfiles', 'PoemNameMapping', 'LookbackPoemExpandedProfiles']}
    schemas = {'AvroSchemas': ['Prefilter']}
    output = {'Output': ['Prefilter']}
    confpath = options.gloconf if options.gloconf else None
    cglob = Global(confpath, schemas, output, prefilter)
    global globopts
    globopts = cglob.parse()

    stats = ()

    if options.cfile and options.date:
        parser.print_help()
        raise SystemExit(1)
    elif options.cfile:
        fname = options.cfile
        date = options.cfile.split('_')[-1]
        date = date.split('.')[0]
        date = date.split('-')
    elif options.date:
        date = options.date.split('-')
    else:
        parser.print_help()
        raise SystemExit(1)

    if len(date) == 0 or len(date) != 3:
        logger.error('Consumer file does not end with correctly formatted date')
        parser.print_help()
        raise SystemExit(1)

    year, month, day = date

    # avro files
    if options.cfile:
        inputFile = options.cfile
    else:
        inputFile = gen_fname_repdate(logger, year+'-'+month+'-'+day, globopts['PrefilterConsumerFilePath'.lower()], '')
    outputFile = gen_fname_repdate(logger, year+'_'+month+'_'+day, globopts['OutputPrefilter'.lower()], '')

    try:
        schema = avro.schema.parse(open(globopts['AvroSchemasPrefilter'.lower()]).read())
        writer = DataFileWriter(open(outputFile, "w"), DatumWriter(), schema)
        reader = DataFileReader(open(inputFile, "r"), DatumReader())
    except IOError as e:
        logger.error(str(e))
        raise SystemExit(1)

    # load poem data
    ngis = loadNGIs(year, month, day)
    profiles = loadFilteredProfiles(year, month, day)
    nameMapping = loadNameMapping(year, month, day)

    s = time.time()
    msgs, msgswrit, msgsfilt, falsemonhost, falseroc, falseprofile = prefilterit(reader, writer, ngis, profiles, nameMapping)
    e = time.time()

    logger.info('ExecTime:%.2fs ConsumerDate:%s Read:%d Written:%d Filtered:%d(Monitoring_Host:%d,ROC:%d,ServiceTypes_Metrics:%d)' % (round(e - s, 2), year+'-'+month+'-'+day,
                                                                                    msgs, msgswrit, msgsfilt, falsemonhost, falseroc,
                                                                                    falseprofile))

    reader.close()
    writer.close()
Example #20
    def test1(self):
        """
    Run a tethered map-reduce job.

    Assumptions: 1) bash is available in /bin/bash
    """
        from word_count_task import WordCountTask
        from avro.tether import tether_task_runner
        from avro.datafile import DataFileReader
        from avro.io import DatumReader
        import avro

        import subprocess
        import StringIO
        import shutil
        import tempfile
        import inspect

        proc = None

        try:

            # TODO we use the tempfile module to generate random names
            # for the files
            base_dir = "/tmp/test_tether_word_count"
            if os.path.exists(base_dir):
                shutil.rmtree(base_dir)

            inpath = os.path.join(base_dir, "in")
            infile = os.path.join(inpath, "lines.avro")
            lines = [
                "the quick brown fox jumps over the lazy dog",
                "the cow jumps over the moon",
                "the rain in spain falls mainly on the plains"
            ]

            self._write_lines(lines, infile)

            true_counts = self._count_words(lines)

            if not (os.path.exists(infile)):
                self.fail("Missing the input file {0}".format(infile))

            # The schema for the output of the mapper and reducer
            oschema = """
{"type":"record",
 "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
     {"name":"key","type":"string"},
     {"name":"value","type":"long","order":"ignore"}
 ]
}
"""

            # write the schema to a temporary file
            osfile = tempfile.NamedTemporaryFile(mode='w',
                                                 suffix=".avsc",
                                                 prefix="wordcount",
                                                 delete=False)
            outschema = osfile.name
            osfile.write(oschema)
            osfile.close()

            if not (os.path.exists(outschema)):
                self.fail("Missing the schema file")

            outpath = os.path.join(base_dir, "out")

            args = []

            args.append("java")
            args.append("-jar")
            args.append(
                os.path.abspath(
                    "@TOPDIR@/../java/tools/target/avro-tools-@[email protected]"
                ))

            args.append("tether")
            args.extend(["--in", inpath])
            args.extend(["--out", outpath])
            args.extend(["--outschema", outschema])
            args.extend(["--protocol", "http"])

            # form the arguments for the subprocess
            subargs = []

            srcfile = inspect.getsourcefile(tether_task_runner)

            # Create a shell script to act as the program we want to execute
            # We do this so we can set the python path appropriately
            script = """#!/bin/bash
export PYTHONPATH={0}
python -m avro.tether.tether_task_runner word_count_task.WordCountTask
"""
            # We need to make sure avro is on the path
            # getsourcefile(avro) returns .../avro/__init__.py
            asrc = inspect.getsourcefile(avro)
            apath = asrc.rsplit(os.sep, 2)[0]

            # path to where the tests lie
            tpath = os.path.split(__file__)[0]

            exhf = tempfile.NamedTemporaryFile(mode='w',
                                               prefix="exec_word_count_",
                                               delete=False)
            exfile = exhf.name
            exhf.write(
                script.format((os.pathsep).join([apath, tpath]), srcfile))
            exhf.close()

            # make it world executable
            os.chmod(exfile, 0755)

            args.extend(["--program", exfile])

            print "Command:\n\t{0}".format(" ".join(args))
            proc = subprocess.Popen(args)

            proc.wait()

            # read the output
            with file(os.path.join(outpath, "part-00000.avro")) as hf:
                reader = DataFileReader(hf, DatumReader())
                for record in reader:
                    self.assertEqual(record["value"],
                                     true_counts[record["key"]])

                reader.close()

        except Exception as e:
            raise
        finally:
            # close the process
            if proc is not None and proc.returncode is None:
                proc.kill()
            if os.path.exists(base_dir):
                shutil.rmtree(base_dir)
            if os.path.exists(exfile):
                os.remove(exfile)
Example #21
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

schema = avro.schema.parse(open('./schema.avsc', 'rb').read())

# Create an avro file

writer = DataFileWriter(open('user.avro', 'wb'), DatumWriter(), schema)
writer.append({'name': 'Eric', 'favorite_number': 128})
writer.append({
    'name': 'Tanya',
    'favorite_color': 'red',
    'favorite_number': 383
})
writer.close()

# Now read that file

reader = DataFileReader(open('user.avro', 'rb'), DatumReader())
for user in reader:
    print(user)
reader.close()
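As a hedged alternative, the third-party fastavro library can read the same container file; a minimal sketch assuming fastavro is installed (it is not used in any of the examples above):

import fastavro

with open('user.avro', 'rb') as fo:
    # fastavro.reader yields each record as a plain dict.
    for user in fastavro.reader(fo):
        print(user)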