Example #1
def print_all_events(path, limit=10):
    """example stepping through all the data files and parsing them

    1. iterate through all data files
    2. open files with avro
    3. parse event JSON
    4. pretty-print events
    """
    printed = 0
    for parent, dirs, files in os.walk(path):
        for fname in sorted(files):
            if printed >= limit:
                print("...")
                return
            printed += 1
            with open(os.path.join(parent, fname), 'rb') as f:
                # this is how you open an avro file
                reader = DataFileReader(f, DatumReader())
                # an avro file provides an iterable of events
                for reading in reader:
                    # the uuid we want to use is reading.SystemProperties.connectionDeviceId
                    print(
                        f"uuid={reading['SystemProperties']['connectionDeviceId']}"
                    )

                    # the actual payload from the app is the json body (as a bytestring)
                    try:
                        # parse it out so it looks nicer when we print:
                        reading['Body'] = json.loads(
                            reading['Body'].decode('utf8'))
                    except ValueError:
                        # leave non-JSON bodies as bytes; this shouldn't happen!
                        pass
                    pprint.pprint(reading)
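Most of the snippets on this page assume the usual Avro imports (DataFileReader/DatumReader from avro.datafile and avro.io). For quick experimentation, here is a minimal, self-contained sketch that writes a tiny file under a hypothetical Reading schema and reads it back the same way; the schema, record values, and file name are placeholders (some avro versions spell the parser avro.schema.Parse).

import json

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

# hypothetical two-field record schema, only for this round trip
schema = avro.schema.parse(json.dumps({
    "type": "record",
    "name": "Reading",
    "fields": [
        {"name": "device", "type": "string"},
        {"name": "value", "type": "double"},
    ],
}))

# write one record to a placeholder file
writer = DataFileWriter(open("sample.avro", "wb"), DatumWriter(), schema)
writer.append({"device": "sensor-1", "value": 3.14})
writer.close()

# read it back; no schema needs to be supplied because it is embedded in the file
with DataFileReader(open("sample.avro", "rb"), DatumReader()) as reader:
    for record in reader:
        print(record)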
Example #2
    def runEngine(self, engine):
        if engine.config.method == "emit":
            engine.emit = lambda x: x

        for record in DataFileReader(
                open("test/prettypfa/exoplanets.avro", "r"), DatumReader()):
            engine.action(record)
Example #3
def lambda_handler(event, context):
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote(event['Records'][0]['s3']['object']['key'])
    splitStr = source_bucket.split(".")
    account = splitStr[0]
    profile = splitStr[1]

    processed_bucket = source_bucket + "-processed"
    processed_key = key

    stream = get_object(source_bucket, key)
    success = copy_object(source_bucket, key, processed_bucket, processed_key)

    if success:
        s3.delete_object(Bucket=source_bucket, Key=key)

    if stream is not None:

        raw_bytes = stream.read()
        avro_bytes = io.BytesIO(raw_bytes)

        reader = DataFileReader(avro_bytes, DatumReader())
        for line in reader:
            send_to_tealium(line, account, profile)

    return ""
Example #4
File: avro.py Project: ohemelaar/tonga
    def decode(self, encoded_event: Any) -> Dict[str, Union[BaseModel, BaseStoreRecord,
                                                            BaseHandler, BaseStoreRecordHandler]]:
        try:
            reader = DataFileReader(BytesIO(encoded_event), DatumReader())
            schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
            schema_name = schema['namespace'] + '.' + schema['name']
            event_data = next(reader)
        except AvroTypeException as err:
            self.logger.exception(f'{err.__str__()}')
            raise AvroDecodeError

        # Finds a matching event name
        for e_name, event in self._events.items():
            if e_name.match(schema_name):  # type: ignore
                event_class = event
                break
        else:
            raise MissingEventClass

        # Finds a matching handler name
        for e_name, handler in self._handlers.items():
            if e_name.match(schema_name):  # type: ignore
                handler_class = handler
                break
        else:
            raise MissingHandlerClass
        return {'event_class': event_class.from_data(event_data=event_data), 'handler_class': handler_class}
Example #5
def build_cars_in_time(files):
    cars = {}

    for idx, file in enumerate(files):
        print_progress(idx, len(files))

        try:
            reader = DataFileReader(open(file, "rb"), DatumReader())
            for car in reader:
                car_reg_number = select_reg_number(car)
                if car_reg_number not in cars:
                    cars[car_reg_number] = []
                cars[car_reg_number].append({
                    'timestamp': select_timestamp(car),
                    'distanceAccumulated': select_distance_accumulated(car),
                    'regNumber': select_reg_number(car)
                })
        except TypeError:
            print("Error reading file {0}".format(file))
        finally:
            reader.close()

    return cars
Example #6
def handle_file(path):
    print("Reading file from: " + path)
    reader = DataFileReader(open(path, "rb"), DatumReader())
    for record in reader:
        cset = CollectionSet()
        cset.ParseFromString(record['Body'])
        handle_collection_set(cset)
Example #7
  def read(self, format):
    time_start = time.time()

    if format == 'json':
      with open('./output/output.json') as file:
        json.loads(file.read())

    elif format == 'jsch':
      with open('./output/output.json') as file:
        validate(json.loads(file.read()), self._schema_json)

    elif format == 'avro':
      reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
      for user in reader:
        pass
      reader.close()

    elif format == 'protobuf':
      with open('./output/output.pb', 'rb') as file:
        addressbook_pb2.AddressBook().ParseFromString(file.read())

    elif format == 'gzjson':
      with gzip.open('./output/output.jsz', 'rb') as file:
        json.loads(file.read())

    time_end = time.time()

    return time_end - time_start
Example #8
    def get_data_in_batches(
        self,
        bucket_name,
        prefix=None,
        data_after=None,
        data_until=None,
        batch_size=10000,
    ):
        rows = []
        for blob in self.generate_blob_list(bucket_name, prefix, data_after,
                                            data_until):
            # download file content as bytes, read via avro
            blob_meta = {
                "blob_name": blob.name,
                "blob_modified_at": blob.updated,
            }
            bytes_data = blob.download_as_string()
            bytes_object = BytesIO(bytes_data)
            bytes_object.mode = "rb+"  # need to "fake" the mode attribute because
            # avro checks the mode of the file given for some reason, fails otherwise
            reader = DataFileReader(bytes_object, DatumReader())
            for row in reader:
                # add blob-level metadata
                row.update(blob_meta)
                rows.append(row)
            if len(rows) >= batch_size:
                yield rows
                rows = []

        if rows:
            # return any data that was left after the last iteration
            yield rows
Example #9
File: test_avro.py Project: zjureel/flink
    def _read_avro_file(self) -> List[dict]:
        records = []
        for file in glob.glob(
                os.path.join(self.avro_dir_name, '**/*')):
            for record in DataFileReader(open(file, 'rb'), DatumReader()):
                records.append(record)
        return records
Example #10
def read_then_to_json(client, file_names, bucket, error_keys_table):
    temp_json_output = []


    for file in file_names:
        filename = "/tmp/temp.avro"
        try:
            client.download_file(Bucket = bucket, Key = file, Filename = filename)
        except Exception as e:
            ''' files which could not be downloaded'''
            print ("File could not be downloaded: " + file)
            error_keys_table['aws']['files'].append(file)
            continue

        try:
            reader = DataFileReader(open(filename , "rb"), DatumReader())

        except Exception as e:
            ''' files that couldn't be opened '''
            print ("File could not be opened: " + file)
            error_keys_table['open']['files'].append(file)
            continue

        for user in reader:
            if user not in temp_json_output:
                temp_json_output.append(user)
    return temp_json_output
Example #11
    def _from_avro_generic(avro_container_uri: str):
        datum_counter = 0
        datum_to_return = None
        # DET TODO add other exception handling around the double with clause
        with open(avro_container_uri, "rb") as avro_fp:
            with DataFileReader(avro_fp, DatumReader()) as reader:
                #
                #  This static method can only initialize one datum in the file - scan through and raise
                #  error if more than one found
                #  Not sure if there is lazy access to the datum - if so returning the datum to caller
                #  for subsequent loading would be problematic
                #
                for datum_counter, datum in enumerate(reader, start=1):
                    print('Reading datum #' + str(datum_counter))
                    print('The message datum = ' + str(datum))
                    if datum_counter == 1:
                        datum_to_return = datum

        if datum_counter > 1:
            raise EmeraldMessageDeserializationError(
                'Unable to deserialize from AVRO container "' +
                avro_container_uri +
                '" - this deserializer can only have one datum per file' +
                os.linesep + 'Total element count in this file = ' +
                str(datum_counter))

        if datum_to_return is None:
            raise EmeraldMessageDeserializationError(
                'Data could not be loaded from AVRO file "' +
                str(avro_container_uri) + '" using schema ' +
                AbstractContainer.get_avro_schema_record().avro_schema_name)

        print('Datum to return = ' + str(datum_to_return))
        print('Type of datum to return = ' + str(type(datum_to_return)))
        return datum_to_return
Example #12
def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)

    log.addHandler(sys_log)

    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())

    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes).decode("utf-8")
        msg = {"messages": [{"data": b64enc}]}

        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)
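The loop above re-encodes each row with a DatumWriter/BinaryEncoder pair and base64-encodes the result before publishing. For completeness, a hedged sketch of how a consumer might reverse that step; the envelope layout follows the msg dict built above, schema is the writer schema pulled off the reader (as in the snippet), and BinaryDecoder is the standard counterpart of BinaryEncoder in avro.io:

import base64
import io
import json

from avro.io import BinaryDecoder, DatumReader


def decode_message(json_str, schema):
    # Parse the published JSON envelope, base64-decode the payload,
    # then read a single datum back using the writer schema.
    msg = json.loads(json_str)
    raw_bytes = base64.b64decode(msg["messages"][0]["data"])
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    return DatumReader(schema).read(decoder)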
Example #13
    def generic_dataframe(self, df, avro_schema, assert_fns=None):
        """Generic test running function for arbitrary avro schemas.

        Writes a dataframe containing the records to avro.

        Reads back and compares with the original
        """
        print(avro_schema)

        cyavro.write_avro_file_from_dataframe(df,
                                              self.filename,
                                              json.dumps(avro_schema),
                                              codec='null')

        if assert_fns is None:
            assert_fns = {}

        df_read = cyavro.read_avro_file_as_dataframe(self.filename)

        import avro.schema
        from avro.datafile import DataFileReader, DataFileWriter
        from avro.io import DatumReader, DatumWriter

        with open(self.filename, 'rb') as fo:
            reader = DataFileReader(fo, DatumReader())
            records = []
            for user in reader:
                records.append(user)
            df_reference = pd.DataFrame(records)
            reader.close()

        success = True

        for col in avro_schema["fields"]:
            colname = col['name']
            assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

            def print_fail_header(s):
                print('#' * len(s))
                print("FAIL: Column {}".format(col))
                print('#' * len(s))
                print(s)

            try:
                assert_fn(df_read[colname], df[colname])
            except AssertionError:
                print_fail_header(
                    "Failed for cyavro read comparison  {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

            try:
                assert_fn(df_reference[colname], df[colname])
            except AssertionError:
                print_fail_header(
                    "Failed for cyavro write comparison {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

        assert success
Example #14
    def get_schema(self):
        """Lazy accessor for data store schema

        If schema is given as a run parameter, then returns this schema.
        Otherwise extracts the schema from the Avro data store files.
        """
        if self._schema:
            return self._schema
        else:
            if not self._schema_path:  #if there is no schema
                paths = self.__get_paths_to_avro_files()
                with DataFileReader(
                        paths[0].open("rb"),
                        _FieldsOrderPreservingDatumReader()) as reader:
                    self._schema = avro.schema.parse(
                        reader.get_meta('avro.schema'))
                    return self._schema
            else:  #a schema is given
                try:
                    self._schema = avro.schema.parse(
                        self._schema_path.open("r").read())
                    return self._schema
                except TypeError:
                    error("supplied schema cannot be parsed!")
                    raise
Example #15
def get_flowrecords_from_flowdata_file(filename_path_input):
    """
    Create a Python generator to read the csv/txt/avro file returning the records to processing.
    *Important: when considering CSV/TXT files remember to use files without header/statistics as input files*
    :param filename_path_input: exported csv/txt/avro flow input file from the original nfpcap file via NFDUMP
    :return: generator to records from file
    """
    if filename_path_input.lower().endswith(('.csv', '.txt')):
        with open(filename_path_input) as csvfile:
            reader = csv.reader(csvfile)
            for line in reader:
                yield create_flow_record_from_csv(line)

    # >> default extension Apache AVRO <<
    else:
        # prepare to read binary
        flowsrecords_reader = DataFileReader(open(filename_path_input, "rb"), DatumReader())
        try:
            for flow in flowsrecords_reader:
                yield flow
        except zlib.error as ze:
            print(ze)
        except IOError as io:
            print(io)
Example #16
def deserializeDataFromFile2Str(inputFile):
    logging.debug("Deserializing file:" + inputFile)
    reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    data = ""
    for item in reader:
        data = data + str(item)
    reader.close()
    return data
Example #17
def open_avrofile(fn_input):
    """
    Return an the data file reader to a given AVRO file.
    *note that to open an .avro file is not necessary to inform the schema because it's embedded in the file*
    :param fn_input:
    :return: record reader object
    """
    return DataFileReader(open(fn_input, "rb"), DatumReader())
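Because the schema travels inside the container, the reader exposes it directly; several examples on this page pull it out via reader.meta or reader.datum_reader.writers_schema. A small sketch with a placeholder file name:

import json

from avro.datafile import DataFileReader
from avro.io import DatumReader

with DataFileReader(open("example.avro", "rb"), DatumReader()) as reader:
    # the embedded writer schema is stored as bytes under the 'avro.schema' metadata key
    embedded = json.loads(reader.meta.get("avro.schema").decode("utf-8"))
    print(embedded["name"], embedded.get("namespace"))
    # the parsed schema object is also available on the datum reader
    print(reader.datum_reader.writers_schema)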
Example #18
def run(argv=None, save_main_session=True):
    '''Main entry point; defines and runs the wordcount pipeline.'''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    processed_users = (lines | 'splits' >> beam.Map(split_and_lower)
                       | 'noNum' >> beam.Map(no_num_format)
                       | 'formatOut' >> beam.Map(format_output))
    (processed_users
     | 'uniqueUser' >> beam.Distinct()
     | 'writeUnique' >> WriteToText(known_args.output,
                                    file_name_suffix='.csv'))

    schema = avro.schema.parse(open("user.avsc", "rb").read())
    processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
        'output_avro', schema, file_name_suffix='.avro')

    reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                            DatumReader())
    for user in reader:
        print(user)
    reader.close()

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d', empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d', word_lengths_dist.result.mean)
Example #19
def main(event, context):
    for record in event['Records']:
        key = record['s3']['object']['key']
        key = urllib.unquote(key).decode('utf8')
        print(key)

        json_file_key = create_json_s3_path(key)
        print(json_file_key)

        obj = client.get_object(Bucket=BUCKET_NAME, Key=key)
        obj = io.BytesIO(obj['Body'].read())
        reader = DataFileReader(obj, DatumReader())
        schema = reader.datum_reader.writers_schema

        schema = schema.__dict__
        try:
            event = schema['_props']['name'].lower()
        except Exception:
            event = None

        converted_avro_data = [
            dict(row, **add_on_dict(obj=row, event=event)) for row in reader
        ]
        json_data = json.dumps(converted_avro_data, indent=1)

        print(json.dumps(converted_avro_data[:10]))

        try:
            resource.Object(
                BUCKET_NAME,
                json_file_key).put(Body=(bytes(json_data.encode('UTF-8'))))
            print("Objects Uploaded to S3")
        except Exception as e:
            print("Objects failed to send to s3")
            print(e)
        '''
        BATCH RECORDS IN 500'S AND JSON.DUMPS EACH ROW
        '''
        if event in [
                'uninstall', 'conversion', 'click', 'impression', 'bounce',
                'open', 'send'
        ]:
            try:
                firehose_records = create_payload(converted_avro_data)
                firehose_records = list(divide_chunks(firehose_records, 500))

                for chunk in firehose_records:
                    response = firehost_client.put_record_batch(
                        DeliveryStreamName=STREAM_NAME, Records=chunk)

                print(
                    "Objects sent to Firehose stream: {0}".format(STREAM_NAME))
                print('{0} firehose records failed'.format(
                    response['FailedPutCount']))

            except Exception as e:
                print("Objects failed to send to Firehose")
                print(e)
Example #20
def deserialize_records(record) -> list:
    #print(f"DESRECORD {record}")
    with io.BytesIO(record) as buf:
        reader = DataFileReader(buf, DatumReader())
        msgs = [msg for msg in reader]
        #print(f'WHAT? {msgs}')
        reader.close()

        return msgs
Example #21
def binToObjSChema(ab):
    datum = io.BytesIO(ab)
    reader = DataFileReader(datum, DatumReader())
    cschema = reader.get_meta('avro.schema')
    print(cschema)
    for user in reader:
        print(user)

    reader.close()
Example #22
File: log_reader.py Project: namesuqi/zeus
def read_log(topic, log):
    base_dir = os.path.abspath(os.path.dirname(__file__))
    schema = avro.schema.parse(open(base_dir + "/avro_schema/" + topic + ".avsc").read())
    print("schema:", schema)
    writer = DataFileWriter(open(base_dir + topic + ".avro", "wb"), DatumWriter(), schema)
    for i in range(5):
        writer.append(log)
    writer.close()
    reader = DataFileReader(open(base_dir + topic + ".avro", "rb"), DatumReader())
    for log in reader:
        print(log)
Example #23
def testRead(filename):
    fd = open(filename, 'rb')

    datum = DatumReader()
    reader = DataFileReader(fd, datum)

    for record in reader:
        print(record['name'], record['age'])

    reader.close()
Example #24
def read_avro(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(DataFileReader(iostream, DatumReader()))
        end = time.time()
        times.append(end - start)
    print(f'... {runs} runs averaged {sum(times) / runs} seconds')
    return records
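A possible way to call it, loading an existing Avro file into an in-memory stream first (the file name here is a placeholder):

import io

with open("events.avro", "rb") as f:
    stream = io.BytesIO(f.read())

records = read_avro(stream, runs=5)
print(len(records), "records read")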
Example #25
def evaluate_file(fname: str):
    logger.info("Opening file %s", fname)
    reader = DataFileReader(open(fname, "rb"), DatumReader())
    logger.info("Counting lines...")
    i = 0
    for val in reader:
        i += 1
        if i % 1000 == 0:
            logger.debug("Read %d lines", i)
    logger.info("Found %d lines in file", i)
Example #26
def deserialize(value):
    """Deserialize AVRO encoded binary string and yield records.
    Args:
        value (str): binary string value.
    Yields:
        dict: deserialized record.
    """
    with DataFileReader(io.BytesIO(value), DatumReader()) as reader:
        for record in reader:
            yield record
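A short usage sketch, assuming the encoded bytes come from an Avro container file on disk (the path is a placeholder):

with open("payload.avro", "rb") as f:
    for record in deserialize(f.read()):
        print(record)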
Example #27
def main(fn, out_fn, avro_mode=''):
    with open(out_fn, 'w') as fo:
        with open(fn, 'rb') as f:
            reader = DataFileReader(f, DatumReader())
            for r in reader:
                if avro_mode.upper() == 'KV':
                    r = r['key']

                fo.write('%s\t%r\n' % (r['office'], r['counts']))
    print('wrote', out_fn)
Example #28
def read_corpus(corpus_path):
    avro_files_path = [
        os.path.join(corpus_path, filename)
        for filename in os.listdir(corpus_path)
        if os.path.splitext(filename)[1] == '.avro'
    ]
    for avro_file in avro_files_path:
        small_corpus = DataFileReader(open(avro_file, 'rb'), DatumReader())
        for article in small_corpus:
            yield article
Example #29
def load_avro(file_name='data.avro'):
    from avro.datafile import DataFileReader
    from avro.io import DatumReader

    path = str(DATA_ROOT / file_name)
    reader = DataFileReader(open(path, "rb"), DatumReader())

    try:
        return list(reader)
    finally:
        reader.close()
Example #30
def read_avro_with_schema(avro_filepath, schema_filepath):
    print("\nfile:{}\nschema:{}".format(avro_filepath, schema_filepath))

    with open(schema_filepath) as f:
        schema = avro.schema.Parse(f.read())

    datum_reader = DatumReader(reader_schema=schema)
    with open(avro_filepath, 'rb') as f:
        with DataFileReader(f, datum_reader) as dfr:
            for record in dfr:
                print(record)
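A possible invocation, with placeholder paths; supplying the reader schema this way asks Avro to resolve the file's embedded writer schema against the schema the caller expects:

read_avro_with_schema("data/users.avro", "schemas/user.avsc")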