Example no. 1
def write_avro(rows, file_out, schema_path):
    schema = avro.schema.parse(open(schema_path, "rb").read())
    writer = DataFileWriter(open(file_out, "wb"), DatumWriter(), schema)
    for line in rows:
        print("INPUT LINE: ", line)
        writer.append({"name": line[0], "sex": line[1], "count": line[2], "year": line[3]})
    writer.close()
Example no. 2
  def _write_lines(self,lines,fname):
    """
    Write the lines to an avro file named fname

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to.
    """
    import avro.io as avio
    from avro.datafile import DataFileReader,DataFileWriter
    from avro import schema

    #recursively make all directories
    dparts=fname.split(os.sep)[:-1]
    for i in range(len(dparts)):
      pdir=os.sep+os.sep.join(dparts[:i+1])
      if not(os.path.exists(pdir)):
        os.mkdir(pdir)


    with open(fname, 'wb') as hf:
      inschema = """{"type":"string"}"""
      writer = DataFileWriter(hf, avio.DatumWriter(), writers_schema=schema.parse(inschema))

      #encoder = avio.BinaryEncoder(writer)
      #datum_writer = avio.DatumWriter()
      for datum in lines:
        writer.append(datum)

      writer.close()
Example no. 3
def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):
    avro_schema = ''
    #load data from hdfs
    cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat', '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'], stdout=subprocess.PIPE)
    for line in cat.stdout:
        avro_schema = avro_schema + line
    schema = avro.schema.parse(avro_schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    #create hdfs folder structure
    dir = create_hdfs_dirs (year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = dir + filename
    tmp_file = '/tmp/' + filename
    
    writer = DataFileWriter(open(tmp_file, "wb"), DatumWriter(), schema)
    
    start_dt = datetime.datetime(year, month, day, hour, 0, 0) 
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_dt = start_dt + datetime.timedelta(hours=1)  # avoids ValueError when hour == 23
    end_ts = int(time.mktime(end_dt.timetuple()))

    for ts in xrange(start_ts, end_ts, 1):
        #generate random pnda record on per host ip basis
        for host_ip in host_ips:
           record = {}
           record['timestamp'] = (ts * 1000)
           record['src'] = 'test'
           record['host_ip'] = host_ip
           record['rawdata'] = generate_random_metrics(metric_ids)
           #encode avro
           writer.append(record)
    writer.close()
    subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir])
    return filepath
Example no. 4
    def outputManager(self, output, key, bucket, output_format='json'):
        if output_format == 'avro':
            avro_schema = avro.schema.Parse(json.dumps(schema[self.endpoint]))
            writer = DataFileWriter(open("{0}.avro".format(self.endpoint), "wb"),
                                    DatumWriter(),
                                    avro_schema)
            for record in output:
                writer.append(record)

            writer.close()

            output_file = "{0}.avro".format(self.endpoint)

        elif output_format == 'json':
            tmp = NamedTemporaryFile("w")

            for row in output:
                tmp.write(json.dumps(row) + '\n')

            tmp.flush()

            output_file = tmp.name

        s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        s3.load_file(
            filename=output_file,
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=True
        )
Example no. 5
def dump_data(bso_number, schema, dsn, args):
    offset = args.offset or 0
    total_rows = 0
    # things time out around 1_500_000 rows.
    db = conf_db(dsn)
    out_file = args.output.rsplit('.', 1)
    row_count = count_rows(db, bso_number)
    for chunk in range(max(1, math.trunc(math.ceil(row_count / MAX_ROWS)))):
        print("Dumping {} rows from bso#{} into chunk {}".format(
            row_count, bso_number, chunk))
        out_file_name = "{}_{}_{}.{}".format(out_file[0], bso_number,
                                             hex(chunk), out_file[1])
        writer = DataFileWriter(open(out_file_name, "wb"), DatumWriter(),
                                schema)
        rows = dump_rows(bso_number=bso_number,
                         chunk_offset=offset,
                         db=db,
                         writer=writer,
                         args=args)
        writer.close()
        if rows == 0:
            break
        offset = offset + rows
        chunk += 1
    return rows
Example no. 6
    def encode(self, obj: BaseRecord) -> bytes:
        """ Encode *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes format

        This function is used by kafka-python

        Args:
            obj (BaseModel): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

        Raises:
            MissingEventClass: can't find the BaseModel in the registered BaseModel list (self._schemas)
            AvroEncodeError: failed to encode the BaseModel to bytes

        Returns:
            bytes: BaseModel in bytes
        """
        try:
            schema = self._schemas[obj.event_name()]
        except KeyError as err:
            self.logger.exception('%s', err.__str__())
            raise MissingEventClass

        try:
            output = BytesIO()
            writer = DataFileWriter(output, DatumWriter(), schema)
            writer.append(obj.to_dict())
            writer.flush()
            encoded_event = output.getvalue()
            writer.close()
        except AvroTypeException as err:
            self.logger.exception('%s', err.__str__())
            raise AvroEncodeError
        return encoded_event
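For reference, a matching decode sketch (not part of the original class): because DataFileWriter produces a self-describing Avro container, the read side only needs a DatumReader.

from io import BytesIO

from avro.datafile import DataFileReader
from avro.io import DatumReader


def decode(encoded_event: bytes) -> dict:
    # The container embeds the writer's schema, so no schema argument is needed here.
    reader = DataFileReader(BytesIO(encoded_event), DatumReader())
    record = next(iter(reader))
    reader.close()
    return record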
Example no. 7
def serializeDataToOCFFile(schemaFile,outputFile,dataToSerialize):
    logging.debug("Parsing in avro schema:"+schemaFile)
    schema=parse_schema(schemaFile)
    logging.debug("Writing avro data to:"+outputFile)
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    writer.append(dataToSerialize)
    writer.close()
Example no. 8
class AvroWriter(object):
    """ AvroWriter """
    def __init__(self, schema, outfile):
        self.schema = schema
        self.outfile = outfile
        self.datawrite = None
        self.avrofile = None
        self._load_datawriter()

    def _load_datawriter(self):
        try:
            lschema = load_schema(self.schema)
            self.avrofile = open(self.outfile, 'w+b')
            self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
        except Exception:
            return False

        return True

    def write(self, data):
        try:
            if (not self.datawrite or not self.avrofile):
                raise AvroWriteException('AvroFileWriter not initialized')

            for elem in data:
                self.datawrite.append(elem)

            self.datawrite.close()
            self.avrofile.close()

        except Exception as e:
            return False, e

        return True, None
Example no. 9
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    logging.debug("Parsing in avro schema:" + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to:" + outputFile)
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    writer.append(dataToSerialize)
    writer.close()
Example no. 10
def hello_gcs(event, context):
    # set storage client
    client = storage.Client()

    # get bucket
    bucket = client.get_bucket(bucket_name)

    # get the data
    print('URL: {}'.format(url))
    response = urllib.request.urlopen(url)
    data = json.loads(response.read())

    # remove unneeded data AND convert to bytes
    #small_data = json.dumps( data['observations'] ).encode('utf-8')

    # write to local file
    file_name = '{}.{}'.format(series_id, file_type)
    local_path = '/tmp/{}'.format(file_name)
    writer = DataFileWriter(open(local_path, "wb"), DatumWriter(), schema)
    for record in data['observations']:
        days_since_epoch, data_point = convert_data_types(record)
        writer.append({"date": days_since_epoch, "value": data_point})
    writer.close()

    # set Blob
    file_name = '{}_{}.{}'.format(series_id, get_datetime(), file_type)
    blob = storage.Blob(file_name, bucket)

    # upload the file to GCS
    blob.upload_from_filename(local_path)

    print('Event ID: {}'.format(context.event_id))
    print('Event type: {}'.format(context.event_type))
    print("""This Function was triggered by messageId {} published at {}
  """.format(context.event_id, context.timestamp))
Example no. 11
def save_records_to_avrofile(
        flows_towrite,
        fn_output,
        avro_schema=cons.DEFAULT_AVRO_NFCAP_FLOWS_SCHEMA_FILEPATH):
    """
    Write to an AVRO file a given a dictionary or a list of dicts containing flow records.
    :param flows_towrite: dict or list of flow records.
    :param fn_output: .avro output filepath and name.
    :param avro_schema: schema to write the records to an .avro file.
    :return: none
    """
    # load schema
    schema = avro.schema.parse(open(avro_schema, "rb").read())

    # create object writer
    writer = DataFileWriter(open(fn_output, "wb"),
                            DatumWriter(),
                            schema,
                            codec="deflate")

    # write records to avro file output
    if isinstance(flows_towrite, dict):
        for record in flows_towrite.values():
            writer.append(record)
    elif isinstance(flows_towrite, list):
        for record in flows_towrite:
            writer.append(record)

    writer.close()
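A hypothetical call, for illustration only; the flow record fields below are placeholders and would need to match whatever the schema at cons.DEFAULT_AVRO_NFCAP_FLOWS_SCHEMA_FILEPATH actually defines.

flows = [
    {"sa": "10.0.0.1", "da": "10.0.0.2", "ibyt": 1500},  # placeholder fields
    {"sa": "10.0.0.3", "da": "10.0.0.4", "ibyt": 64},
]
save_records_to_avrofile(flows, "flows.avro")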
Example no. 12
def write_data_to_avro(raw_data, data_type):
    
    data_folder = Path('avro')
    avro_file = data_type + '.avro'
    avro_file_path = data_folder / avro_file
    avsc_file = data_type + '.avsc'
    avsc_file_path = data_folder / avsc_file

    schema = avro.schema.Parse(open(avsc_file_path.resolve(), "rb").read())

    writer = DataFileWriter(open(avro_file_path.resolve(), "wb"), DatumWriter(), schema)

    for _ , record in raw_data.iterrows():
        dict = record.to_dict()

        if data_type == "stops":
            dict['stop_lat_lon'] = {'stop_lon': dict['stop_lat_lon'].x, 'stop_lat': dict['stop_lat_lon'].y}
        '''
        if data_type == "stop_times":
            #del dict['arrival_time']
            #del dict['departure_time']
            del dict['stop_id']
            del dict['stop_sequence']
            del dict['pickup_type']
            del dict['drop_off_type']
            del dict['timepoint']
        '''
        writer.append(dict)
    
    writer.close()
Example no. 13
def handle_avro_client_print_to_file(connection, address):

    schema = avro.schema.Parse(open("schema/addressbook.avsc", "rb").read())

    data = connection.recv(4)

    message_length, = struct.unpack('>I', data)

    message = connection.recv(message_length)

    message_buf = io.BytesIO(message)
    reader = avro.datafile.DataFileReader(message_buf, avro.io.DatumReader())

    # Create a data file using DataFileWriter

    dataFile = open("schema/addressbook.avro", "wb")

    writer = DataFileWriter(dataFile, DatumWriter(), schema)

    for thing in reader:
        writer.append(thing)
    reader.close()

    writer.close()
    return (len(message))
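The server above expects a 4-byte big-endian length prefix followed by an Avro object container payload. A minimal client-side sketch of that framing (the host, port and send_avro_records name are illustrative, not taken from the original):

import io
import socket
import struct

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def send_avro_records(records, host="localhost", port=9090,
                      schema_path="schema/addressbook.avsc"):
    schema = avro.schema.Parse(open(schema_path, "rb").read())
    buf = io.BytesIO()
    writer = DataFileWriter(buf, DatumWriter(), schema)
    for record in records:
        writer.append(record)
    writer.flush()
    payload = buf.getvalue()
    writer.close()
    with socket.create_connection((host, port)) as conn:
        # length prefix first, then the Avro container bytes
        conn.sendall(struct.pack('>I', len(payload)) + payload)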
Example no. 14
    def write_to_hdfs(rows: List[Tuple[str, str]]):
        conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
        uri = conn.get_uri()
        pat = re.compile(r"http://(\w+(:\w+)?)?@")
        print(conn.get_uri())

        uri = pat.sub("http://", uri)
        print(uri)
        print(conn.login)
        client = InsecureClient(uri, user=conn.login)
        sch = avro.schema.make_avsc_object({
            'type':'record',
            'name':'Video',
            'fields': [
                {'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'title'},
                {'type': ["null", {'type': 'string', 'avro.java.string': 'String'}], 'name': 'description'},
            ]
        })
        local_file_name = 'videos.avro'
        writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
        for row in rows:
            print(row)
            writer.append({"title":row[0], "description":row[1]})
        writer.close()
        client.upload('/tmp/videos.avro', local_file_name)
Example no. 15
 def _produce_test_input(self):
     schema = avro.schema.parse("""
     {
         "name": "TestQueryTask_record",
         "type": "record",
         "doc": "The description",
         "fields": [
             {"name": "col0", "type": "int", "doc": "The bold"},
             {"name": "col1", "type": {
                 "name": "inner_record",
                 "type": "record",
                 "doc": "This field shall be an inner",
                 "fields": [
                     {"name": "inner", "type": "int", "doc": "A inner field"},
                     {"name": "col0", "type": "int", "doc": "Same name as outer but different doc"},
                     {"name": "col1", "type": ["null", "string"], "default": null, "doc": "Nullable primitive"},
                     {"name": "col2", "type": ["null", {
                         "type": "map",
                         "values": "string"
                     }], "default": null, "doc": "Nullable map"}
                 ]
             }, "doc": "This field shall be an inner"},
             {"name": "col2", "type": "int", "doc": "The beautiful"},
             {"name": "col3", "type": "double"}
         ]
     }""")
     self.addCleanup(os.remove, "tmp.avro")
     writer = DataFileWriter(open("tmp.avro", "wb"), DatumWriter(), schema)
     writer.append({'col0': 1000, 'col1': {'inner': 1234, 'col0': 3000}, 'col2': 1001, 'col3': 1.001})
     writer.close()
     self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")
Example no. 16
def score(graphs, schema, url, port):
    """
    graphs is expected to be a list of dictionaries, where each entry in the 
    list represents a graph with 
    * key idx -> index value
    * key vertices -> list of ints representing vertices of the graph
    * key edges -> list of list of ints representing edges of graph
    """

    stream = BufferedWriter(BytesIO())
    writer = DataFileWriter(stream, avro.io.DatumWriter(), schema)
    # writer = DataFileWriter(open("imdb-graph.avro", "wb"), DatumWriter(), schema)
    for graph in graphs:
        writer.append({
            "edges": graph["edges"],
            "vertices": graph["vertices"],
            "index": graph["idx"],
            "label": graph.get("label")
        })
        writer.flush()
    raw_bytes = stream.raw.getvalue()
    writer.close()

    url = "{}:{}/predictUnstructured/?ret_mode=binary".format(
        url.strip("/"), port)

    payload = raw_bytes
    headers = {'Content-Type': 'application/octet-stream'}

    response = requests.request("POST", url, headers=headers, data=payload)

    return response
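A hypothetical invocation; schema is assumed to be a parsed Avro schema with edges/vertices/index/label fields, and the URL and port are placeholders.

graphs = [
    {"idx": 0, "vertices": [0, 1, 2], "edges": [[0, 1], [1, 2]], "label": 1},
    {"idx": 1, "vertices": [0, 1], "edges": [[0, 1]], "label": None},
]
response = score(graphs, schema, "http://localhost", 8080)
print(response.status_code)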
Example no. 17
def export(table, args):
    dest = args.dest
    print('{} exporting table {}...'.format(
        strftime("%H:%M:%S"), table))
    if exp_dict[table] is None:
        global cnxpool
        cnx = cnxpool.get_connection()
        writer = None
        try:
            query = "SELECT * from {}".format(table)
            cursor = cnx.cursor(dictionary=True)
            cursor.execute(query)
            rows = cursor.fetchall()
            schema = avro.schema.parse(
                open(os.path.join("schema", "{}.avsc".format(table)), "rb").read())
            file_path = os.path.join(dest, "{}.avro".format(table))
            print('{} exporting to {}'.format(
                strftime("%H:%M:%S"), file_path))
            writer = DataFileWriter(
                open(file_path, "wb"), DatumWriter(), schema)
            for i, row in enumerate(rows):
                if i != 0 and i % 5000 == 0:
                    print('{} {} records exported...'.format(
                        strftime("%H:%M:%S"), i))
                writer.append(row)
            cursor.close()
        except:
            print(sys.exc_info()[0])
            raise
        finally:
            cnx.close()
            if writer:
                writer.close()
    else:
        exp_dict[table].export(table, dest, args)
Example no. 18
def testAppend(filename):
    fd = open(filename, 'a+b')
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(fd, datum_writer)
    for i in xrange(10, 20):
        fwriter.append(_makeTestPerson(i))
    fwriter.close()
Example no. 19
  def write(self, format):
    time_start = time.time()

    if format == 'json' or format == 'jsch':
      with open('./output/output.json', 'w') as file:
        for base_person_obj in self._base_person_list:
          file.write(json.dumps(self._get_json_person(base_person_obj), separators=(',', ':')))
        # file.write(json.dumps(self._data_dict, separators=(',', ':')))

    elif format == 'avro':
      writer = DataFileWriter(open('./output/output.avro', 'wb'), DatumWriter(), self._schema_avro)
      for user in self._data_dict:
        writer.append(user)
      writer.close()

    elif format == 'protobuf':
      with open('./output/output.pb', 'wb') as file:
        for base_person_obj in self._base_person_list:
          protobuf_person = self._get_proto_buf_person(base_person_obj)
          file.write(protobuf_person.SerializeToString())

    elif format == 'gzjson':
      with gzip.open('./output/output.jsz', 'wb') as file:
        file.write(json.dumps(self._data_dict, separators=(',', ':')).encode('utf-8'))

    time_end = time.time()

    return time_end - time_start
Example no. 20
    def ExportToBin(self, data, schema=None) -> tuple:
        '''
        Export the data object using the given schema, in binary format (bytes)
        '''
        if not schema == None:
            pschema = self._parseschema(schema)
            if pschema[0]:
                schema = self._data['schema']
            else:
                return pschema

        else:
            schema = self._data['schema']

        if not type(schema) is avro.schema.RecordSchema:
            schema = None
        try:
            with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
                writer = DataFileWriter(tmp, DatumWriter(), schema)
                if isinstance(data, list):
                    for d in data:
                        writer.append(d)
                else:
                    writer.append(data)
                writer.flush()
                tmp.seek(0)
                export_bin = tmp.read()
                writer.close()
                tmp.close()
                self._data['data'] = export_bin
            return (True, export_bin, self.getSchemaInfos())
        except Exception as e:
            return (False, str(e), self.getSchemaInfos())
Example no. 21
def _exp_wcctrn(p):
    global cnxpool, count, file_path, schema
    flag, dest = p
    print('{} [{}] exporting {}...'.format(strftime("%H:%M:%S"), os.getpid(),
                                           flag))
    cnx = cnxpool.get_connection()
    writer = None
    _schema = None
    if file_path is None or count >= parallel_threshold:
        file_path = os.path.join(
            dest, "wcc_trn", "{}_{}.avro".format(os.getpid(),
                                                 strftime("%Y%m%d_%H%M%S")))
        print('{} allocating new file {}...'.format(strftime("%H:%M:%S"),
                                                    file_path))
        count = 0
        _schema = schema
    try:
        cursor = cnx.cursor(dictionary=True, buffered=True)
        cursor.execute("SELECT * from wcc_trn where flag = %s", (flag, ))
        rows = cursor.fetchall()
        total = cursor.rowcount
        cursor.close()
        writer = DataFileWriter(open(file_path, "ab+"), DatumWriter(), _schema)
        for row in rows:
            writer.append(row)
        count += total
    except:
        print(sys.exc_info()[0])
        raise
    finally:
        cnx.close()
        if writer:
            writer.close()
Example no. 22
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send basic messages to Kafka"""
    # Get Kafka producer
    producer = cluster.kafka.producer()

    basic_data_formats = ['XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD', 'BINARY', 'LOG', 'TEXT', 'JSON']

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)

    elif data_format == 'WITH_KEY':
        producer.send(topic, message, key=get_random_string(string.ascii_letters, 10).encode())

    elif data_format == 'AVRO':
        writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(topic, raw_bytes)

    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        bytes_writer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer = DataFileWriter(writer=bytes_writer, datum_writer=datum_writer,
                                          writer_schema=avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer.append(message)
        data_file_writer.flush()
        raw_bytes = bytes_writer.getvalue()
        data_file_writer.close()
        producer.send(topic, raw_bytes)

    producer.flush()
Example no. 23
def save_avro(data, file_name='data.avro', test=True):
    import json

    import avro.schema

    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    schema_path = str(DATA_ROOT / 'schemas.avsc')
    with open(schema_path) as f:
        schema = avro.schema.SchemaFromJSONData(json.load(f))

    if test:
        file_name = "{}.{}".format(file_name, os.getpid())

    path = str(DATA_ROOT / file_name)
    writer = DataFileWriter(open(path, "wb"), DatumWriter(), schema)

    try:
        for datum in data:
            writer.append(datum)
    finally:
        writer.close()

        if test:
            os.remove(path)
Example no. 24
  def _write_lines(self,lines,fname):
    """
    Write the lines to an avro file named fname

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to.
    """
    import avro.io as avio
    from avro.datafile import DataFileReader,DataFileWriter
    from avro import schema

    #recursively make all directories
    dparts=fname.split(os.sep)[:-1]
    for i in range(len(dparts)):
      pdir=os.sep+os.sep.join(dparts[:i+1])
      if not(os.path.exists(pdir)):
        os.mkdir(pdir)


    with open(fname, 'wb') as hf:
      inschema = """{"type":"string"}"""
      writer = DataFileWriter(hf, avio.DatumWriter(), writers_schema=schema.parse(inschema))

      #encoder = avio.BinaryEncoder(writer)
      #datum_writer = avio.DatumWriter()
      for datum in lines:
        writer.append(datum)

      writer.close()
Example no. 25
def _loadAvro(config, superSchema, daysArray):
    print("**********************Loading ForecastDataAvro****************")
    autGenSchemaFile = config["ETL"]["Extract"]["AutGenSchemaFile"]
    forecastAvroFile = config["ETL"]["Load"]["Avro"]["File"]
    dWHForecastPath = config["ETL"]["Load"]["AvgData"]["DWHForecastPath"]
    
    dayAvroSchema = _autogenerateSchema(superSchema)

    with open(dWHForecastPath+autGenSchemaFile, "w") as file:
        file.write(json.dumps(dayAvroSchema, indent=4))
    # create avro.schema from json schema
    dayAvroSchemaString = json.dumps(dayAvroSchema)
    schema = avro.schema.Parse(dayAvroSchemaString)

    avroFile = dWHForecastPath + forecastAvroFile
    # create a writer for DWH
    writer = DataFileWriter(open(avroFile, "wb"),
                            DatumWriter(), schema)

    # append each day
    for day in daysArray:
        # pp.pprint(day)
        writer.append(day)

    # close writer
    writer.close()
    # pp.pprint(writer)
    _readAvro(avroFile)
Example no. 26
  def position_sorter(key_val, output_dir):
    from apache_beam.io.gcp import gcsio
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter
    from itertools import groupby

    key = key_val[0]
    vals = list(key_val[1])
    vals = sorted(vals, key=lambda x: int(x['position']))

    out_file_path = output_dir.get() + "{:06d}.avro".format(key)
    out_file = gcsio.GcsIO().open(out_file_path, 'wb')

    schema = avro.schema.parse(schema_string)
    writer = DataFileWriter(out_file, DatumWriter(), schema)

    def clean_record(record):
      cleaned = {k:v for (k,v) in record.items() if v is not None}
      cleaned.pop('position', None)
      return cleaned

    #for key, group in groupby(things, lambda x: x[0]):
    for position, values in groupby(vals, lambda x: int(x['position'])):
      cleaned_values = [ clean_record(record) for record in values]
      writer.append({"position": position, "values" : cleaned_values})

    writer.close()

    return out_file_path
Example no. 27
class AvroRecordWriter(TrivialRecordWriter):
    def __init__(self, simulator, stream):
        super(AvroRecordWriter, self).__init__(simulator, stream)

        self.deserializers = {}
        schema = None
        if self.simulator.avro_output_key_schema:
            self.deserializers['k'] = AvroDeserializer(self.simulator.avro_output_key_schema)
            schema = avro.schema.parse(self.simulator.avro_output_key_schema)

        if self.simulator.avro_output_value_schema:
            self.deserializers['v'] = AvroDeserializer(self.simulator.avro_output_value_schema)
            schema = avro.schema.parse(self.simulator.avro_output_value_schema)

        if self.simulator.avro_output == 'kv':
            schema_k_parsed = avro.schema.parse(self.simulator.avro_output_key_schema)
            schema_v_parsed = avro.schema.parse(self.simulator.avro_output_value_schema)

            schema_k = json.loads(self.simulator.avro_output_key_schema)
            schema_k.pop('namespace', None)
            schema_v = json.loads(self.simulator.avro_output_value_schema)
            schema_v.pop('namespace', None)

            schema = {
                'type': 'record',
                'name': 'kv',
                'fields': [
                   {'name': 'key', 'type': schema_k},
                   {'name': 'value', 'type': schema_v if schema_k_parsed.fullname != schema_v_parsed.fullname
                   else schema_k_parsed.name}
                ]
            }
            schema = avro.schema.parse(json.dumps(schema))

        self.writer = DataFileWriter(self.stream, DatumWriter(), schema)

    def send(self, cmd, *vals):
        if cmd == 'done':
            self.writer.close()
        super(AvroRecordWriter, self).send(cmd, *vals)

    def output(self, key, value):
        if self.simulator.avro_output == 'k':
            obj_to_append = self.deserializers['k'].deserialize(key)
        elif self.simulator.avro_output == 'v':
            obj_to_append = self.deserializers['v'].deserialize(value)
        else:
            obj_to_append = {
                'key': self.deserializers['k'].deserialize(key),
                'value': self.deserializers['v'].deserialize(value)
            }
        self.writer.append(obj_to_append)

    def close(self):
        try:
            self.writer.close()
        except ValueError:  # let's ignore if already closed
            pass
        self.stream.close()
Example no. 28
def gen_avro(filename):
    schema = avro.schema.parse(SCHEMA)
    fo = open(filename, "wb")
    writer = DataFileWriter(fo, DatumWriter(), schema)
    for record in looney_records():
        writer.append(record)
    writer.close()
    fo.close()
Example no. 29
 def encode(self, raw_data):
     byte_stream = BytesIO()
     writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
     writer.append(raw_data)
     writer.flush()
     serialized_data = byte_stream.getvalue()
     writer.close()
     return serialized_data
Example no. 30
def gen_avro(filename):
    schema = avro.schema.parse(SCHEMA)
    fo = open(filename, "wb")
    writer = DataFileWriter(fo, DatumWriter(), schema)
    for record in looney_records():
        writer.append(record)
    writer.close()
    fo.close()
Example no. 31
    def run(self):
        # for normalizing alcohol
        minimum, maximum, average = 100, 0, 0

        with open('raw.csv', 'r') as fd:
            csv_reader = csv.reader(fd, delimiter=',')

            collection = {}
            for i, row in enumerate(csv_reader):
                desc = row[3].lower().replace('.', '').replace(',', '')

                alc = float(row[-1])
                if alc < minimum:
                    minimum = alc
                if alc > maximum:
                    maximum = alc
                average += alc

                # Remove gifts or items without description
                if 'engin' in desc:
                    continue

                if 'gjafa' in desc or 'gjafa' in row[0]:
                    continue

                if 'öskju' in desc or 'öskju' in row[0]:
                    continue

                if 'flöskur m/glasi' in desc or 'kútur' in row[0]:
                    continue

                features = self.parse(desc.split(), row[0])
                features['alcohol'] = alc
                collection[row[0]] = features

        average = average / (i + 1)

        with open('beers.avsc', 'r') as fd:
            schema = avro.schema.Parse(fd.read())

        with open('beers.avro', 'wb') as fd:
            writer = DataFileWriter(fd, DatumWriter(), schema)

            denominator_alc = maximum - minimum

            for k, v in collection.items():
                v['bitterness'] = self.BITTERNESS['class'][
                    v['bitterness']] / self.BITTERNESS['maximum']
                v['color'] = self.COLOR['class'][
                    v['color']] / self.COLOR['maximum']
                v['clarity'] = self.CLARITY['class'][
                    v['clarity']] / self.CLARITY['maximum']
                v['sweetness'] = self.SWEETNESS['class'][
                    v['sweetness']] / self.SWEETNESS['maximum']
                v['alcohol'] = (v['alcohol'] - minimum) / denominator_alc
                v['name'] = k
                writer.append(v)
            writer.close()
Example no. 32
def _create_avro_file(schema, items, file_prefix):
    _, result_file_path = tempfile.mkstemp(prefix=file_prefix, suffix='.avro')
    parsed_schema = avro.schema.Parse(schema)
    with open(result_file_path, 'wb') as f:
        writer = DataFileWriter(f, DatumWriter(), parsed_schema)
        for s in items:
            writer.append(s)
        writer.close()
    return result_file_path
Example no. 33
def gen_single_day_data(date, schema):
    writer = DataFileWriter(open("events2-{}.avro".format(date), "w"), DatumWriter(), schema)
    N = 10 ** 5
    for i in xrange(0, N):
        tags = ["t{}".format(random.randint(1, 10)) for x in range(0, 4)]
        (tag1, tag2, tag3, tag4) = tags
        cookie = 'CK.{}'.format(random.randint(1, 10 ** 5))
        writer.append({"tag1":tag1, "tag2":tag2, "tag3": tag3, "tag4":tag4, "date":date, "cookie":cookie, "count": 1})
    writer.close()
Example no. 34
def testWrite(filename):
    schema_object = avro.schema.parse(TEST_SCHEMA)

    fd = open(filename, 'wb')
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(fd, datum_writer, schema_object)
    for i in xrange(10):
        fwriter.append(_makeTestPerson(i))
    fwriter.close()
Example no. 35
class Avro_Merger(object):
    _merge_started = False
    _avro_extention = '.avro'
    _avro_stats_record = None


    def __init__(self, path, new_filename):
        try:
            self._avro_files = filter(lambda x: x.endswith(self._avro_extention), iter(os.listdir(path)))
            schema = avro.schema.parse(open(schema_file).read())
            self._writter = DataFileWriter(open(output_file, 'wb'), DatumWriter(), schema, 'deflate')
        except Exception as e:
            raise avro.schema.AvroException(e)
            sys.exit(1)


    def flog_metadata_handler(func):
        """ This is a decorator that handles avro meta data as well as very last stats record 
            in each file during merging
        """    
        def wrapper(self, avro_records):
            """ Wrapper method for consuming flog avro file
            """
            # Handle meta data
            if self._writter.tell() != 0:  # TODO, need to fix this
                next(avro_records)

            # Handle stats line
            self._avro_stats_record = deque(avro_records, maxlen=1).pop()

            func(self, avro_records)

        return wrapper
        

    @flog_metadata_handler
    def consume_avro(self, avro_records):
        """ Write the avro data from the butter to file
        """
        for record in avro_records:
            self._writter.append(record)

    
    def merge(self):
        """ Loop through the avros and merge each file
        """
        for file_ in self._avro_files:
            try:
                avro_records = DataFileReader(open(os.path.join(input_dir, file_), "r"), DatumReader())
            except Exception as e:
                raise avro.schema.AvroException(e)

            # Consume the records!
            self.consume_avro(avro_records)

        # Write stats data to the last of the file
        self._writter.append(self._avro_stats_record)
        self._writter.close()
Example no. 36
class AvroFileWriter(Writer):
    def __init__(self, schemaFile, avroFile):
        self.schema = avro.schema.Parse(open(schemaFile, "rb").read())
        self.writer = DataFileWriter(open(avroFile, "wb"), DatumWriter(), self.schema)
    def write(self, obj):
        self.writer.append(obj)

    def close(self):
        self.writer.close()
Example no. 37
    def check_schema(self, data, schema_path):
        schema = avro.schema.Parse(
            open(schema_path, "rb").read().decode("utf-8"))

        writer = DataFileWriter(open('_test.avro', "wb"), DatumWriter(),
                                schema)

        writer.append(data)
        writer.close()
Example no. 38
def create_archive(basedir, destdir):
    all_files = []
    all_dirs = []

    # make a snapshot in case the output directory is the bundle source - so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        for d in dirs:
            dir = os.path.join(path, d)
            all_dirs.append(dir)
        for f in files:
            file = os.path.join(path, f)
            all_files.append(file)

    schema = avro.schema.parse(
        open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "avro-schemas.json")).read())
    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1

    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')
    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)

        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(fd,
                                                             writer,
                                                             iteration,
                                                             fileprefix,
                                                             destdir,
                                                             datum,
                                                             schema)
                file = makefile(os.path.basename(os.path.normpath(f)),
                                os.path.relpath(f, basedir),
                                numsiblings,
                                sibling,
                                chunk)
                writer.append(file)
                writer.flush()
                del file

        for f in all_files:
            os.remove(f)

        for d in all_dirs:
            os.rmdir(d)

    finally:
        writer.close()
        fd.close()
Example no. 39
def read_log(topic, log):
    schema = avro.schema.parse(open(os.path.abspath(os.path.dirname(__file__)) + "/avro_schema/" + topic + ".avsc").read())
    print "schema:", schema
    writer = DataFileWriter(open(os.path.abspath(os.path.dirname(__file__)) + topic + ".avro", "wb"), DatumWriter(), schema)
    for i in range(5):
        writer.append(log)
    writer.close()
    reader = DataFileReader(open(os.path.abspath(os.path.dirname(__file__)) + topic + ".avro", "rb"), DatumReader())
    for log in reader:
        print log
Example no. 40
def objToBin2():
    file = io.BytesIO()
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(file, datum_writer, sc)
    for d in datum:
        fwriter.append(d)
    ab = file.getvalue()
    fwriter.close()

    return ab
Example no. 41
def write_json_to_avro(schema_uri, output_uri, json_str):

    schema = avro.schema.parse(open(schema_uri).read())
    writer = DataFileWriter(open(output_uri, "wb"), DatumWriter(), schema)
    json_list = json.loads(json_str)

    for row in json_list:
        writer.append(row)

    writer.close()
Example no. 42
def main():

    if len(sys.argv) < 3:
        print "Usage:", sys.argv[0]
        print "add [num of events to add] filename"
        print "list filename"
        exit(1)

    command = sys.argv[1]

    if command == 'add':

        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events

        existingEvents = {}

        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            existingEvents = reader
            reader.close()
        except IOError:
            print filename + ": Could not open file.  Creating a new one."

        # Write back out to disk

        try:

            schema = avro.schema.parse(open("etc/userevent.avsc").read())

            f = open(filename, "w")
            writer = DataFileWriter(f, DatumWriter(), schema)

            # Append new user events

            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print newEvent
                writer.append(newEvent)

            writer.close()

            print "Wrote {0} user events".format(noEvents)
        except IOError:
            print filename + ": Could not save file."

    elif command == 'list':

        listAllUserEvents(sys.argv[2])

    else:
        print "Unregistered command. Exiting"
        sys.exit(1)
Example no. 43
def main():
    parser = ArgumentParser(description="Simple AMS example of subscription pull/consume")
    parser.add_argument('--host', type=str, default='messaging-devel.argo.grnet.gr', help='FQDN of AMS Service')
    parser.add_argument('--token', type=str, required=True, help='Given token')
    parser.add_argument('--project', type=str, required=True, help='Project  registered in AMS Service')
    parser.add_argument('--subscription', type=str, required=True, help='Subscription name')
    parser.add_argument('--topic', type=str, required=True, help='Given topic')
    parser.add_argument('--nummsgs', type=int, default=3, help='Number of messages to pull and ack')
    parser.add_argument('--schema', type=str, required=True, help='Avro schema')
    parser.add_argument('--outfile', type=str, required=True, help='Output avro file')
    args = parser.parse_args()

    # initialize service with given token and project
    ams = ArgoMessagingService(endpoint=args.host, token=args.token, project=args.project)

    # ensure that the subscription is created on the first run. messages can be
    # pulled from the subscription only if the subscription already existed
    # for the given topic before messages were published to it
    try:
        if not ams.has_sub(args.subscription):
            ams.create_sub(args.subscription, args.topic)
        subscription = ams.get_sub(args.subscription, retobj=True)
    except AmsException as e:
        print(e)
        raise SystemExit(1)

    # try to pull number of messages from subscription. method will
    # return (ackIds, AmsMessage) tuples from which ackIds and messages
    # payload will be extracted.
    avro_payloads = list()
    for msg in subscription.pullack(args.nummsgs, retry=5, retrysleep=15, return_immediately=True):
        data = msg.get_data()
        msgid = msg.get_msgid()
        print('msgid={0}'.format(msgid))
        avro_payloads.append(data)

    try:
        schema = load_schema(args.schema)
        if os.path.exists(args.outfile):
            avroFile = open(args.outfile, 'a+b')
            writer = DataFileWriter(avroFile, DatumWriter())
        else:
            avroFile = open(args.outfile, 'w+b')
            writer = DataFileWriter(avroFile, DatumWriter(), schema)

        for am in avro_payloads:
            msg = avro_deserialize(am, args.schema)
            writer.append(msg)

        writer.close()
        avroFile.close()

    except Exception as e:
        print(e)
        raise SystemExit(1)
Example no. 44
def write(fin, fout, schema):
    "write json to avro"
    schema = avro.schema.parse(open(schema).read())
    data = json.load(open(fin, 'r'))
    writer = DataFileWriter(open(fout, "wb"), DatumWriter(), schema)
    if  isinstance(data, list):
        for doc in data:
            writer.append(doc)
    else:
        writer.append(data)
    writer.close()
Example no. 45
    def _write_to_avro(self, log, fields):
        msglist = []
        msg, tags = {}, {}

        msg = {'service': fields['serviceType'],
               'timestamp': fields['timestamp'],
               'hostname': fields['hostName'],
               'metric': fields['metricName'],
               'status': fields['metricStatus']}
        msgattrmap = {'detailsData': 'message',
                      'summaryData': 'summary',
                      'nagios_host': 'monitoring_host'}
        for attr in msgattrmap.keys():
            if attr in fields:
                msg[msgattrmap[attr]] = fields[attr]

        tagattrmap = {'ROC': 'roc', 'voName': 'voName', 'voFqan': 'voFqan'}
        for attr in tagattrmap.keys():
            tags[tagattrmap[attr]] = fields.get(attr, None)
        if tags:
            msg['tags'] = tags

        if ',' in fields['serviceType']:
            servtype = fields['serviceType'].split(',')
            msg['service'] = servtype[0].strip()
            msglist.append(msg)
            copymsg = msg.copy()
            copymsg['service'] = servtype[1].strip()
            msglist.append(copymsg)
        else:
            msglist.append(msg)

        sh.thlock.acquire(True)
        try:
            schema = avro.schema.parse(open(self.avroSchema).read())
            if path.exists(log):
                avroFile = open(log, 'a+b')
                writer = DataFileWriter(avroFile, DatumWriter())
            else:
                avroFile = open(log, 'w+b')
                writer = DataFileWriter(avroFile, DatumWriter(), schema)

            for m in msglist:
                writer.append(m)

            writer.close()
            avroFile.close()

        except (IOError, OSError) as e:
            sh.Logger.error(e)
            raise SystemExit(1)

        finally:
            sh.thlock.release()
Example no. 46
def testWrite(filename, schema):
    fd = open(filename, 'wb')

    datum = DatumWriter()
    writer = DataFileWriter(fd, datum, schema)

    writer.append(makeObject("Person A", 23))
    writer.append(makeObject("Person B", 31))
    writer.append(makeObject("Person C", 28))

    writer.close()
Example no. 47
def main(schema_fn, csv_fn, avro_fn):

    with open(schema_fn) as f_in:
        schema = avro.schema.parse(f_in.read())

    with open(csv_fn) as f_in:
        reader = csv.reader(f_in, delimiter=';')
        with open(avro_fn, 'wb') as f_out:
            writer = DataFileWriter(f_out, DatumWriter(), schema)
            for row in reader:
                writer.append(dict(zip(FIELDS, row)))
            writer.close()
Example no. 48
  def _write_data(self,
                  directory=None,
                  prefix=tempfile.template,
                  codec='null',
                  count=len(RECORDS)):

    with tempfile.NamedTemporaryFile(
        delete=False, dir=directory, prefix=prefix) as f:
      writer = DataFileWriter(f, DatumWriter(), self.SCHEMA, codec=codec)
      len_records = len(self.RECORDS)
      for i in range(count):
        writer.append(self.RECORDS[i % len_records])
      writer.close()

      self._temp_files.append(f.name)
      return f.name
Example no. 49
class AvroWriter(RecordWriter):

    schema = None

    def __init__(self, context):
        super(AvroWriter, self).__init__(context)
        job_conf = context.job_conf
        part = int(job_conf['mapreduce.task.partition'])
        outdir = job_conf["mapreduce.task.output.dir"]
        outfn = "%s/part-r-%05d.avro" % (outdir, part)
        wh = hdfs.open(outfn, "w")
        self.writer = DataFileWriter(wh, DatumWriter(), self.schema)

    def close(self):
        self.writer.close()
        # FIXME do we really need to explicitly close the filesystem?
        self.writer.writer.fs.close()
Example no. 50
class AvroAppender(threading.Thread):
    def __init__(self, file):
        threading.Thread.__init__(self)
        self.avro_writer = DataFileWriter(open(file, "w"), DatumWriter(), schema)
        self.queue = Queue.Queue()
        self.should_stop = False
        self.mutex = threading.Lock()
        self.start()


    def log_append(self, user, advertiser, **kwargs):
        if user is not None and advertiser is not None:
            record = dict(user=user, advertiser=advertiser)
            if kwargs["ip"]:
                record["ip"] = kwargs["ip"]
            if kwargs["agent"]:
                record["agent"] = kwargs["agent"]
            if kwargs["time"]:
                record["timestamp"] = float(kwargs["time"])
            else:
                record["timestamp"] = float(time.time())
            if kwargs["keywords"]:
                record["keywords"] = list(set([string.strip() for string in kwargs["keywords"].split(",")]))
            self.queue.put_nowait(record)
        else:
            print "Missing user"


    def close_appender(self):
        self.mutex.acquire()
        self.should_stop = True
        self.mutex.release()

    def run(self):
        while True:
            try:
                record = self.queue.get(False, 1000)
                self.avro_writer.append(record)
            except Queue.Empty:
                self.mutex.acquire()
                stop = self.should_stop
                self.mutex.release()
                if stop:
                    break
        self.avro_writer.close()
Example no. 51
    def write(self):
        try:
            schema = avro.schema.parse(open(self.schema).read())
            avrofile = open(self.outfile, 'w+b')
            datawrite = DataFileWriter(avrofile, DatumWriter(), schema)

            for elem in self.listdata:
                datawrite.append(elem)

            datawrite.close()
            avrofile.close()

        except (avro.schema.SchemaParseException, avro.io.AvroTypeException):
            self.logger.error(" couldn't parse %s" % self.schema)
            raise SystemExit(1)
        except IOError as e:
            self.logger.error(e)
            raise SystemExit(1)
Example no. 52
def main():
  """Start of execution"""
  #combine the schemas 
  known_schemas = avro.schema.Names()
  types_schema = LoadAvsc("parameter_types.avsc", known_schemas)
  param_schema = LoadAvsc("parameter.avsc", known_schemas)
  print json.dumps(param_schema.to_json(avro.schema.Names()), indent=2) 
  #test the schema works 
  param_file = open("parameters.avro", "w")
  writer = DataFileWriter(param_file, DatumWriter(), param_schema)
  param_1 = {"name": "test", "description":"An Avro test.", "type":"int"}
  param_2 = {"name": "test", "description":"An Avro test.", "type":"boolean"}
  writer.append(param_1)
  writer.append(param_2)
  writer.close()
  reader = DataFileReader(open("parameters.avro", "r"), DatumReader())
  for parameter in reader:
      print parameter
  reader.close()  
Example no. 53
def readAndWriteAvro():
    """ Unlike java, avro does not let you generate
        code for Tweet in python. So only way to read and write
        data is without using code generation"""

    #Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())


    #write some data
    writer = DataFileWriter(open("tweets.avro", "w"), DatumWriter(), schema)
    writer.append({"tweetId": 5, "user": "******", "text" : "Tweeting from python as well"})
    writer.close()

    #read the same data
    tweets = DataFileReader(open("tweets.avro", "r"), DatumReader())
    for tweet in tweets:
        print tweet
    tweets.close()
Example no. 54
def main(argv):
    try:
        schema_fn = argv[1]
        n_users = int(argv[2])
        avro_fn = argv[3]
    except IndexError:
        sys.exit('Usage: %s SCHEMA_FILE N_USERS AVRO_FILE' % argv[0])
    with open(schema_fn) as f_in:
        schema = avro.schema.parse(f_in.read())
    with open(avro_fn, 'wb') as f_out:
        writer = DataFileWriter(f_out, DatumWriter(), schema)
        for i in xrange(n_users):
            writer.append({
                'name': random.choice(NAME_POOL),
                'office': random.choice(OFFICE_POOL),
                'favorite_color': random.choice(COLOR_POOL),
                'favorite_number': i,
            })
        writer.close()
Example no. 55
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', nargs=1, help='new schema', required=True, metavar='avro schema')
    parser.add_argument('-i', nargs='+', help='avro files', required=True, metavar='avro file')
    parser.add_argument('-ts', action='store_true', help='convert int tag values to str', required=False)
    parser.add_argument('-o', nargs=1, help='output directory', required=True, metavar='output directory')
    args = parser.parse_args()

    for f in args.i:
        out = []

        if args.o[0].startswith('/'):
            dest = args.o[0]
        else:
            dest = os.path.abspath('.') + '/' + args.o[0]

        try:
            os.makedirs(dest)
        except OSError as e:
            if e.args[0] != errno.EEXIST:
                print os.strerror(e.args[0]), e.args[1], args.o[0]
                raise SystemExit(1)

        schema = avro.schema.parse(open(args.s[0]).read())
        writer = DataFileWriter(open(dest + '/' + os.path.basename(f), 'w'), DatumWriter(), schema)
        reader = DataFileReader(open(f, 'r'), DatumReader())

        try:
            for i, entry in enumerate(reader):
                if args.ts:
                    for t in entry['tags']:
                        if isinstance(entry['tags'][t], int):
                            entry['tags'][t] = str(entry['tags'][t])
                writer.append(entry)

            writer.close()

        except UnicodeDecodeError as e:
            pprint.pprint(e)
            print f
Example no. 56
 def _produce_test_input(self):
     schema = avro.schema.parse("""
     {
       "type":"record",
       "name":"TrackEntity2",
       "namespace":"com.spotify.entity.schema",
       "doc":"Track entity merged from various sources",
       "fields":[
         {
           "name":"map_record",
           "type":{
             "type":"map",
             "values":{
               "type":"record",
               "name":"MapNestedRecordObj",
               "doc":"Nested Record in a map doc",
               "fields":[
                 {
                   "name":"element1",
                   "type":"string",
                   "doc":"element 1 doc"
                 },
                 {
                   "name":"element2",
                   "type":[
                     "null",
                     "string"
                   ],
                   "doc":"element 2 doc"
                 }
               ]
             }
           },
           "doc":"doc for map"
         },
         {
           "name":"additional",
           "type":{
             "type":"map",
             "values":"string"
           },
           "doc":"doc for second map record"
         },
         {
           "name":"track_gid",
           "type":"string",
           "doc":"Track GID in hexadecimal string"
         },
         {
           "name":"track_uri",
           "type":"string",
           "doc":"Track URI in base62 string"
         },
         {
           "name":"Suit",
           "type":{
             "type":"enum",
             "name":"Suit",
             "doc":"enum documentation broz",
             "symbols":[
               "SPADES",
               "HEARTS",
               "DIAMONDS",
               "CLUBS"
             ]
           }
         },
         {
           "name":"FakeRecord",
           "type":{
             "type":"record",
             "name":"FakeRecord",
             "namespace":"com.spotify.data.types.coolType",
             "doc":"My Fake Record doc",
             "fields":[
               {
                 "name":"coolName",
                 "type":"string",
                 "doc":"Cool Name doc"
               }
             ]
           }
         },
         {
           "name":"master_metadata",
           "type":[
             "null",
             {
               "type":"record",
               "name":"MasterMetadata",
               "namespace":"com.spotify.data.types.metadata",
               "doc":"metadoc",
               "fields":[
                 {
                   "name":"track",
                   "type":[
                     "null",
                     {
                       "type":"record",
                       "name":"Track",
                       "doc":"Sqoop import of track",
                       "fields":[
                         {
                           "name":"id",
                           "type":[
                             "null",
                             "int"
                           ],
                           "doc":"id description field",
                           "default":null,
                           "columnName":"id",
                           "sqlType":"4"
                         },
                         {
                           "name":"name",
                           "type":[
                             "null",
                             "string"
                           ],
                           "doc":"name description field",
                           "default":null,
                           "columnName":"name",
                           "sqlType":"12"
                         }
                       ],
                       "tableName":"track"
                     }
                   ],
                   "default":null
                 }
               ]
             }
           ]
         },
         {
           "name":"children",
           "type":{
             "type":"array",
             "items":{
               "type":"record",
               "name":"Child",
               "doc":"array of children documentation",
               "fields":[
                 {
                   "name":"name",
                   "type":"string",
                   "doc":"my specific child\'s doc"
                 }
               ]
             }
           }
         }
       ]
     }""")
     self.addCleanup(os.remove, "tmp.avro")
     writer = DataFileWriter(open("tmp.avro", "wb"), DatumWriter(), schema)
     writer.append({
         u'track_gid': u'Cool guid',
         u'map_record': {
             u'Cool key': {
                 u'element1': u'element 1 data',
                 u'element2': u'element 2 data'
             }
         },
         u'additional': {
             u'key1': u'value1'
         }, u'master_metadata': {
             u'track': {
                 u'id': 1,
                 u'name': u'Cool Track Name'
             }
         }, u'track_uri': u'Totally a url here',
         u'FakeRecord': {
             u'coolName': u'Cool Fake Record Name'
         },
         u'Suit': u'DIAMONDS',
         u'children': [
             {
                 u'name': u'Bob'
             },
             {
                 u'name': u'Joe'
             }
         ]
     })
     writer.close()
     self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")
Example no. 57
def main():
   known_schemas = avro.schema.Names()

   with open("point.avsc", "rb") as fp:
      point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("review.avsc", "rb") as fp:
      review = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("place.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), point)
   writer.append({'x': 1.5, 'y': 2.75})
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['x'] == 1.5
   assert deserialized['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5})
      assert False
   except AvroTypeException as e:
      pass

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5, 'y': "wtanaka.com"})
      assert False
   except AvroTypeException as e:
      pass

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75}
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75},
         'review': {'rating': 4, 'text': '4 stars would come again'},
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), place)
      writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
            })
      assert False
   except AvroTypeException as e:
      pass
Example no. 58
            'Cc': cleanCc,
            'Date': message['Date'],
            'Subject': message['Subject'],
            'Body': get_body(message)
            })
        print(cleanFrom)

schema = avro.schema.Parse(open("email.avro.schema").read())
writer = DataFileWriter(open("email.avro", "wb"), DatumWriter(), schema)
   
pathToEmails  = '../emails/Archives'
pathToCleanup = '../emails/name_to_address.csv'

mboxfiles = [os.path.join(dirpath, f)
	     for dirpath, dirnames, files in os.walk(pathToEmails)
	     for f in files if f.endswith('mbox')]
mailTable = []
#print(mboxfiles)

for mboxfile in mboxfiles:
#   print(mboxfile)
    write_avro(mboxfile, writer, pathToCleanup)   

writer.close()

#reader = DataFileReader(open("email.avro", "rb"), DatumReader())
#for email in reader:
#   print(email['Subject'])
#   reader.close()

Example no. 59
def writeFile():
    writer = DataFileWriter(open("part-00000.avro", "w"), DatumWriter(), schema)
    writer.append({"logline": "2016\t30"})
    writer.close()