Example #1
def concat_avro_files(input_paths: list,
                      output_path: str,
                      avro_tools_path: str = None) -> None:
    """Concatenate Avro files using avro-tools jar utility."""

    avro_tools_cli = _get_avro_tools_cli(avro_tools_path)

    first_input_schema = None
    default_subprocess_kwargs = dict(shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

    for file_path in input_paths:
        if not fastavro.is_avro(file_path):
            raise ValueError(f'Input {file_path!r} is not an Avro file')

        print(f'Reading schema from {file_path!r}...')
        getmeta = subprocess.run(
            f'{avro_tools_cli} getmeta {file_path} '
            f'--key avro.schema', **default_subprocess_kwargs)

        getmeta.check_returncode()
        if getmeta.stderr:
            print(f'getmeta.stderr:\n{getmeta.stderr.decode()}')
        avro_schema = json.loads(getmeta.stdout.decode())

        if first_input_schema is None:
            first_input_schema = avro_schema
        else:
            assert avro_schema == first_input_schema

    print(f'Concatenating Avro files into {output_path!r}...')
    concat = subprocess.run(
        f'{avro_tools_cli} concat {" ".join(input_paths)} '
        f'{output_path}', **default_subprocess_kwargs)
    concat.check_returncode()

    if not os.path.isfile(output_path):
        raise AssertionError(
            f'{output_path!r} was not created '
            f'(avro-tools stdout: {concat.stdout.decode()!r})')

    print(f'Checking schema in output file {output_path!r}...')
    getmeta = subprocess.run(
        f'{avro_tools_cli} getmeta {output_path} '
        f'--key avro.schema', **default_subprocess_kwargs)

    getmeta.check_returncode()
    output_schema = json.loads(getmeta.stdout.decode())

    diffs = list(dictdiffer.diff(output_schema, first_input_schema))
    if diffs:
        print('Differences in output vs input schema:')
        for diff in diffs:
            print(diff)
        assert list(
            dictdiffer.diff(output_schema['fields'],
                            first_input_schema['fields'])) == []
    else:
        print('Input/output schema identical')
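# Hypothetical usage sketch (file names and jar path are illustrative, not
# from the source): concatenate two part files whose schemas must match.
concat_avro_files(['part-0.avro', 'part-1.avro'],
                  'combined.avro',
                  avro_tools_path='/opt/avro-tools.jar')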
Example #2
    def is_match(cls, file_path, options=None):
        """
        Test the given file to check if the file has valid
        AVRO format or not.
        
        :param file_path: path to the file to be examined
        :type file_path: str
        :param options: avro read options
        :type options: dict
        :return: is file a avro file or not
        :rtype: bool
        """
        if options is None:
            options = dict()

        # get current position of stream
        if data_utils.is_stream_buffer(file_path):
            starting_location = file_path.tell()

        is_valid_avro = fastavro.is_avro(file_path)

        # return to original position in stream
        if data_utils.is_stream_buffer(file_path):
            file_path.seek(starting_location, 0)

        return is_valid_avro
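# Why the tell/seek dance above matters: fastavro.is_avro reads header bytes
# from a file-like object and leaves its position moved. A minimal sketch,
# using io.BytesIO as a stand-in for the stream buffers data_utils detects
# (the file name is hypothetical):
import io
import fastavro

with open('alerts.avro', 'rb') as fh:
    buf = io.BytesIO(fh.read())

start = buf.tell()
ok = fastavro.is_avro(buf)  # consumes the header, advancing the position
buf.seek(start, 0)          # rewind so the buffer can be read again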
Example #3
    def _load_avro_files(self, ext_path: str = None):
        """ Load Avro alert data

        Parameters
        ----------
        ext_path: str, optional
            If not None, explicitly load data under `ext_path`.
            Default is None (self.path is used).
        """
        if ext_path is not None:
            path = ext_path
        else:
            path = self.path

        if isinstance(path, list):
            self.filenames = path
        elif os.path.isdir(path):
            self.filenames = glob.glob(os.path.join(path, '*.avro'))
        elif path == '':
            print('WARNING: path to avro files is empty')
            self.filenames = []
        elif fastavro.is_avro(path):
            self.filenames = [path]
        else:
            msg = """
            Data path not understood: {}
            You must give an avro file with
            its extension (.avro), or a folder with avro files.
            """.format(path)
            raise IOError(msg)
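# Hypothetical calls covering the three accepted path forms (the owning
# class and the `loader` instance are assumptions, not shown in the source):
# loader._load_avro_files('/data/night_20200101')       # directory of .avro
# loader._load_avro_files('/data/alert-001.avro')       # single Avro file
# loader._load_avro_files(['/a/x.avro', '/a/y.avro'])   # explicit file list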
Example #4
def read_avro_alerts(data_path: str) -> Iterable[dict]:
    """ Read avro alert files and return an iterable
    with dicts of alert data

    Parameters
    ----------
    data_path: str
        a directory path where to look for avro alert files

    Returns
    -------
    record: Iterable
        a generator that yields the first record (dict) of each avro file
        in the given directory
    """
    avro_files = glob.glob(data_path + '/*.avro')

    for avro_file in avro_files:
        # check for valid avro file
        if not fastavro.is_avro(avro_file):
            continue

        with open(avro_file, 'rb') as f:
            reader = fastavro.reader(f)
            record = next(reader)

        yield record
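# Note that the generator above yields only the first record of each file.
# A sketch of a variant that streams every record (same glob/fastavro
# imports assumed as in the original):
def read_all_avro_records(data_path: str) -> Iterable[dict]:
    for avro_file in glob.glob(data_path + '/*.avro'):
        if not fastavro.is_avro(avro_file):
            continue
        with open(avro_file, 'rb') as f:
            for record in fastavro.reader(f):
                yield record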
Example #5
def get_dataframe_from_avro(input_path: str) -> pandas.DataFrame:
    """Create a DataFrame from Avro file (in-memory, mind your sizes)."""

    if not fastavro.is_avro(input_path):
        raise ValueError(f'Input {input_path!r} is not an Avro file')

    with open(input_path, 'rb') as avro_file:
        avro_reader = fastavro.reader(avro_file)
        df = pandas.DataFrame.from_records(list(avro_reader))

    return df
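# Hypothetical usage: load a small file and take a quick look.
df = get_dataframe_from_avro('alerts.avro')
print(df.shape)
print(df.head())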
Example #6
    def is_match(cls, file_path, options=None):
        """
        Test the given file to check if the file has valid
        AVRO format or not.
        
        :param file_path: path to the file to be examined
        :type file_path: str
        :param options: avro read options
        :type options: dict
        :return: is file a avro file or not
        :rtype: bool
        """

        is_valid_avro = fastavro.is_avro(file_path)
        return is_valid_avro
Example #7
def sample_avro_file(input_paths: list,
                     output_path: str,
                     limit: int,
                     sample_rate: float = 0.5,
                     avro_tools_path: str = None) -> None:
    """Sample records from an Avro file using avro-tools jar utility."""

    avro_tools_cli = _get_avro_tools_cli(avro_tools_path)

    default_subprocess_kwargs = dict(shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

    for input_path in input_paths:
        if not fastavro.is_avro(input_path):
            raise ValueError(f'Input {input_path!r} is not an Avro file')

    sample_cmd = (f'{avro_tools_cli} cat --limit {limit} '
                  f'--samplerate {sample_rate} {" ".join(input_paths)} '
                  f'{output_path}')
    print(f'Sampling: {sample_cmd!r}...')
    cat = subprocess.run(sample_cmd, **default_subprocess_kwargs)
    cat.check_returncode()
    print(f'Result: {cat.stdout.decode()!r}')
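# Hypothetical invocation: keep at most 1000 records, sampling roughly 10%
# of the input (file names are illustrative).
sample_avro_file(['events.avro'],
                 'events-sample.avro',
                 limit=1000,
                 sample_rate=0.1)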
Example #8
def test_is_avro_fo():
    for path in iglob('%s/*.avro' % data_dir):
        with open(path, 'rb') as fp:
            assert fastavro.is_avro(fp)
    with open(__file__, 'rb') as fp:
        assert not fastavro.is_avro(fp)
Example #9
def test_is_avro_str():
    for path in iglob('%s/*.avro' % data_dir):
        assert fastavro.is_avro(path)
    assert not fastavro.is_avro(__file__)
Example #10
def convert(in_path, out_path, dedup_threshold):
    """
    Convert .avro files in in_path into .parquet files in out_path.

    :param in_path: The input path for the .avro files.
    :type in_path: str
    :param out_path: The output path to write .parquet files.
    :type out_path: str
    :param dedup_threshold: The duplication percentage for dictionary compression.
    :type dedup_threshold: float
    :return:
    """
    for avro_file_path in file_crawler.crawl(in_path):
        log.info(f"Opening Avro file {avro_file_path}")
        if not is_avro(str(avro_file_path)):
            log.error(f"error: {avro_file_path} is not an Avro file")
            sys.exit(1)

        with open(avro_file_path, "rb") as open_file:
            avro_data = reader(open_file)
            # Get the ordered list of field names from the avro schema
            avro_file_schema = avro_data.metadata['avro.schema']
            log.debug(f"avro_file_schema: {avro_file_schema}")
            avro_schema = avro_data.writer_schema
            log.debug(f"avro_schema: {avro_schema}")

            # Read Avro file into Pandas dataframe
            data_frame = pd.DataFrame(
                data=avro_data,
                # Preserve column ordering
                columns=[x['name'] for x in avro_schema['fields']])
            log.debug(f"Data Frame info: {data_frame}")
        # Get a list of columns with hashable types
        log.debug(f"All Columns: {[x for x in data_frame.columns]}")
        hashable_cols = [
            x for x in data_frame.columns
            if isinstance(data_frame[x][0], Hashable)
        ]
        log.debug(f"Hashable columns from the data_frame: {hashable_cols}")
        # Find columns whose duplication ratio exceeds dedup_threshold, for use with dictionary encoding
        dupcols = [
            x.encode('UTF-8') for x in hashable_cols
            if (data_frame[x].duplicated().sum() /
                (int(data_frame[x].size) - 1)) > dedup_threshold
        ]
        log.debug(f"Columns to dedup: {dupcols}")
        table = pa.Table.from_pandas(data_frame).replace_schema_metadata({
            'parquet.avro.schema':
            avro_file_schema,
            'writer.model.name':
            'avro'
        })
        parts = avro_file_path.parts
        parquet_file_path = pathlib.Path(os.path.join(out_path, *parts[3:]))
        parquet_file_path.parent.mkdir(parents=True, exist_ok=True)
        parquet_file_path = os.path.splitext(parquet_file_path)[0] + '.parquet'
        log.info(f"Writing parquet file: {parquet_file_path}")
        pq.write_table(table,
                       parquet_file_path,
                       compression='gzip',
                       use_dictionary=dupcols,
                       compression_level=5,
                       coerce_timestamps='ms',
                       allow_truncated_timestamps=False)
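# Hypothetical invocation: mirror a tree of .avro files into .parquet,
# dictionary-encoding any column with more than 30% duplicate values.
convert('/data/avro', '/data/parquet', dedup_threshold=0.3)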
Example #11
from fastavro import reader, is_avro

file_path = "/Users/vpeche/tmp/avro/000000000000.AVRO"
print("File is AVRO: {}".format(is_avro(file_path)))

with open(file_path, 'rb') as fo:
    avro_reader = reader(fo)
    for record in avro_reader:
        print(record)
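# As Example #10 shows, fastavro's reader also exposes the writer's schema;
# a quick inspection sketch for the same file:
with open(file_path, 'rb') as fo:
    avro_reader = reader(fo)
    print(avro_reader.writer_schema)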
Example #12
    def __init__(self, data, schema=None):
        """
        :param data: dict, list of dicts, JSON str, file, bytes
        :param schema: dict 
        """
        self._last_error = None  # Last error captured
        self._object_data = None
        self._json_data = None
        self._avro_data = None
        self._origin = None
        self._schema = None
        self._schema_origin = None

        self._ok = False
        if schema is None:
            self._schema = None
        elif isinstance(schema, str):
            try:
                success, schema, origin = AvroTools.fetch_json(schema)
                if success:
                    schema = json.loads(schema)
                    self._schema_origin = origin
                else:
                    schema = None

            except Exception as e:
                self._last_error = str(e)
                schema = None

        if schema is not None:
            try:
                self._schema = parse_schema(schema)
                if self._schema_origin is None:
                    self._schema_origin = type(schema).__name__
            except Exception as e:
                self._last_error = str(e)
                schema = None

        if isinstance(data, bytes):
            b_avro = False
            try:
                bdata = io.BytesIO(data)
                if is_avro(bdata):
                    self._origin = 'binary_avro'
                    bdata.seek(0)
                    b_avro = True
                    avro_reader = reader(bdata)
                    self._schema = avro_reader.writer_schema
                    obj_data = list(avro_reader)
                    self._object_data = (None if len(obj_data) == 0
                                         else obj_data[0] if len(obj_data) == 1
                                         else obj_data)
                    self._ok = True
                else:
                    self._origin = 'binary_string'
                    data = data.decode('utf-8')

            except Exception as e:
                self._last_error = ('Avro binary' if b_avro else
                                    'String decoding') + f' error: {e}'

        if isinstance(data, str):
            success, json_data, origin = AvroTools.fetch_json(data)
            if not self._origin:
                self._origin = origin
            if not success:
                self._last_error = json_data
                return

            try:
                self._object_data = json.loads(json_data)
                self._json_data = json_data
                if self._schema is None:
                    self._ok = True
            except Exception as e:
                self._last_error = f'JSON parsing error: {e}'

        elif isinstance(data, (dict, list)):
            self._origin = type(data).__name__
            self._object_data = data
            if self._schema is None:
                self._ok = True

        if self._object_data is not None and not self._ok and self._schema is not None:
            try:
                validate(self._object_data, self._schema)
                self._ok = True
            except Exception as e:
                self._last_error = f'Schema error: {e}'
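# A minimal sketch of the validation path the constructor relies on, using
# fastavro directly (the record and schema here are illustrative):
from fastavro import parse_schema
from fastavro.validation import validate

schema = parse_schema({
    'type': 'record',
    'name': 'User',
    'fields': [{'name': 'id', 'type': 'long'},
               {'name': 'email', 'type': 'string'}],
})
validate({'id': 1, 'email': 'a@example.com'}, schema)  # raises on mismatch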