Example #1
def test_fixed_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type":
        "record",
        "name":
        "test_fixed_named_type",
        "fields": [
            {
                "name": "test1",
                "type": {
                    "type": "fixed",
                    "name": "my_fixed",
                    "size": 4,
                },
            },
            {
                "name": "test2",
                "type": "my_fixed",
            },
        ],
    }

    record = {"test1": b"1234", "test2": b"4321"}
    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
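
# A minimal round-trip sketch (not part of the test above), reusing the same
# `parsed_schema` and `record`: the named "my_fixed" type declared for test1 is
# referenced by name for test2 when writing and reading.
import io
from fastavro import writer, reader

buf = io.BytesIO()
writer(buf, parsed_schema, [record])
buf.seek(0)
assert next(reader(buf)) == {"test1": b"1234", "test2": b"4321"}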
Example #2
def test_no_data(tmpdir):
    filepath = tmpdir + "no_data.avro"
    schema = {
        "name":
        "Weather",
        "type":
        "record",
        "fields": [
            {
                "name": "station",
                "type": "string"
            },
            {
                "name": "time",
                "type": "long"
            },
            {
                "name": "temp",
                "type": "int"
            },
        ],
    }
    parsed_schema = fa.parse_schema(schema)
    with open(filepath, "wb") as out:
        fa.writer(out, parsed_schema, [])

    df = cudf.read_avro(filepath)

    # fastavro returns an empty dataframe, need to verify manually
    assert_eq(df.shape, (0, 3))
    dtypes = df.dtypes.values.tolist()
    assert_eq(dtypes, [np.dtype("O"), np.dtype("int64"), np.dtype("int32")])
    col_names = df.columns.tolist()
    assert_eq(col_names, ["station", "time", "temp"])
Example #3
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))
    attributes: dict = field(default_factory=ClickAttribute.attributes)

    schema = parse_schema({
        "type":
        "record",
        "name":
        "click_event",
        "namespace":
        "com.udacity.lesson3.exercise2",
        "fields": [
            {
                "name": "email",
                "type": "string"
            },
            {
                "name": "timestamp",
                "type": "string"
            },
            {
                "name": "uri",
                "type": "string"
            },
            {
                "name": "number",
                "type": "int"
            },
            #
            # TODO: Add the attributes map!
            {
                "name": "attributes",
                "type": {
                    "type": "map",
                    "values": {
                        "type":
                        "record",
                        "name":
                        "attribute",
                        "fields": [{
                            "name": "element",
                            "type": "string"
                        }, {
                            "name": "content",
                            "type": "string"
                        }]
                    }
                }
            }
        ]
    })

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
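
# A hedged companion sketch (not in the original exercise) showing how the bytes
# returned by serialize() can be decoded again, assuming ClickEvent is a
# @dataclass and its faker/ClickAttribute dependencies are available.
from io import BytesIO
from fastavro import reader

event_bytes = ClickEvent().serialize()
decoded = next(reader(BytesIO(event_bytes)))
print(decoded["email"], decoded["attributes"])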
Example #4
def test_enum_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type":
        "record",
        "name":
        "test_enum_named_type",
        "fields": [
            {
                "name": "test1",
                "type": {
                    "type": "enum",
                    "name": "my_enum",
                    "symbols": ["FOO", "BAR"],
                },
            },
            {
                "name": "test2",
                "type": "my_enum",
            },
        ],
    }

    record = {"test1": "FOO", "test2": "BAR"}
    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
Example #5
def write_avro():
    schema = {
        "namespace": "sample.avro",
        "type": "record",
        "name": "Cars",
        "fields": [
            {"name": "model", "type": "string"},
            {"name": "make", "type": ["string", "null"]},
            {"name": "year", "type": ["int", "null"]}
        ]
    }

    records = [
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010}
    ]

    os.makedirs(os.path.join('.', 'data'), exist_ok=True)
    with open(os.path.join('.', 'data', 'cars.avro'), 'wb') as avro_file:
        writer(avro_file, parse_schema(schema), records)
    with open(os.path.join('.', 'data', 'cars.json'), 'w') as json_file:
        json.dump(records, json_file)
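
# A small companion sketch (an assumption, not part of the original function):
# read ./data/cars.avro back with fastavro.reader. Records written without the
# optional "make"/"year" union fields come back with None for those fields.
import os
from fastavro import reader

with open(os.path.join('.', 'data', 'cars.avro'), 'rb') as avro_file:
    cars = list(reader(avro_file))
print(len(cars), cars[0])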
Example #6
def encode_into_avro(alert: dict, schema_file: str) -> bytes:
    """Encode a dict record into avro bytes

    Parameters
    ----------
    alert: dict
        A Dictionary of alert data
    schema_file: str
        Path of avro schema file

    Returns
    -------
    value: bytes
        Avro-encoded alert data as a bytes string

    Examples
    ----------
    >>> r = AlertReader(avro_single_alert)
    >>> alert = r.to_list(size=1)[0]
    >>> avro_encoded = encode_into_avro(alert, schema_path)
    """
    with open(schema_file) as f:
        schema = json.load(f)

    parsed_schema = fastavro.parse_schema(schema)
    b = io.BytesIO()
    fastavro.schemaless_writer(b, parsed_schema, alert)

    return b.getvalue()
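
# A hedged decoding sketch for the function above (illustrative name, not part of
# the original module): schemaless_writer emits bare Avro bytes with no embedded
# schema, so the same parsed schema must be passed to schemaless_reader.
def decode_from_avro(avro_bytes: bytes, schema_file: str) -> dict:
    with open(schema_file) as f:
        parsed_schema = fastavro.parse_schema(json.load(f))
    return fastavro.schemaless_reader(io.BytesIO(avro_bytes), parsed_schema)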
Example #7
def load_schema(name, suffix=""):
    """Load an Avro schema from the local app data.

    This function is memoized so that repeated calls are fast.

    Parameters
    ----------
    name : `str`
        Name of the schema. This should be a fully-qualified name that matches
        the file name of schemas in the ``schemas/`` directory of the source
        repository.
    suffix : `str`, optional
        A suffix to add to the schema's name. This is typically used to create
        "staging" schemas, therefore "staging subjects" in the Schema Registry.

    Returns
    -------
    schema : `dict`
        A schema object.
    """
    schemas_dir = Path(__file__).parent / "schemas"
    schema_path = schemas_dir / f"{name}.json"

    schema = json.loads(schema_path.read_text())

    if suffix:
        schema["name"] = "".join((schema["name"], suffix))

    return fastavro.parse_schema(schema)
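
# The docstring above says load_schema is memoized, but no cache decorator appears
# in this excerpt. A minimal sketch of one way to do it with the standard library
# (an assumption, not necessarily the original project's approach):
import functools

@functools.lru_cache(maxsize=None)
def load_schema_cached(name, suffix=""):
    return load_schema(name, suffix)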
Example #8
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))

    #
    # TODO: Define an Avro Schema for this ClickEvent
    #       See: https://avro.apache.org/docs/1.8.2/spec.html#schema_record
    #       See: https://fastavro.readthedocs.io/en/latest/schema.html?highlight=parse_schema#fastavro-schema
    #
    schema = parse_schema({
        "type"
    })

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        #
        # TODO: Rewrite the serializer to send data in Avro format
        #       See: https://fastavro.readthedocs.io/en/latest/schema.html?highlight=parse_schema#fastavro-schema
        #
        # HINT: Python dataclasses provide an `asdict` method that can quickly transform this
        #       instance into a dictionary!
        #       See: https://docs.python.org/3/library/dataclasses.html#dataclasses.asdict
        #
        # HINT: Use BytesIO for your output buffer. Once you have an output buffer instance, call
        #       `getvalue()` to retrieve the data inside the buffer.
        #       See: https://docs.python.org/3/library/io.html?highlight=bytesio#io.BytesIO
        #
        return json.dumps(
            {"uri": self.uri, "timestamp": self.timestamp, "email": self.email}
        )
Example #9
    def __write(self, data):
        """Write data into the avro file."""
        if not self.__schema:
            self.__schema = parse_schema(
                self.__avro_translator.deduce_scheme(data[0]))

        fast_writer(self.__descriptor, self.__schema, data, self.__codec)
Example #10
def cudf_from_avro_util(schema, records):

    schema = [] if schema is None else fastavro.parse_schema(schema)
    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records)
    buffer.seek(0)
    return cudf.read_avro(buffer)
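
# A usage sketch for the helper above; the schema and records here are invented
# for illustration and assume cudf and fastavro are installed.
point_schema = {
    "type": "record",
    "name": "Point",
    "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}],
}
df = cudf_from_avro_util(point_schema, [{"x": 1, "y": 2}, {"x": 3, "y": 4}])
assert df.shape == (2, 2)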
Example #11
    def __init__(self, file_, data=None, schema=None, codec: str = 'snappy'):
        """Create an avro archive.

        Note:
            Specification: https://avro.apache.org/docs/1.8.2/spec.html

        Args:
            file_ (str, BytesIO, IOBase): the output file_
            data (dict, list(dict)): the data to write
            schema (dict): the avro schema as dictionary

        Returns:
            AvroDataFileWriter: the instance of this object
        """
        self.__descriptor = None
        if isinstance(file_, str):
            self.__descriptor = open(file_, 'wb')
        elif isinstance(file_, (BytesIO, IOBase)):
            self.__descriptor = file_
        else:
            raise Exception("Type '{}' for file_ is not supported...".format(
                type(file_)))
        self.__schema = None
        self.__codec = codec
        self.__avro_translator = AvroObjectTranslator()
        if schema:
            self.__schema = parse_schema(schema)
        if data is not None:
            self.append(data)
Example #12
    def _write_inference_result(self, sample_ids, labels, weights, scores,
                                scores_and_offsets, task_index, schema_params, output_dir):
        """ Write inference results. """
        photon_ml_writer = PhotonMLWriter(schema_params=schema_params)
        output_avro_schema = photon_ml_writer.get_inference_output_avro_schema(
            self.metadata,
            self._has_label(schema_params[constants.LABEL]),
            True,
            has_weight=self._has_feature(schema_params[constants.SAMPLE_WEIGHT]))
        parsed_schema = parse_schema(output_avro_schema)

        records = []
        for rec_id, rec_label, rec_weight, rec_score, rec_score_and_offset in \
                zip(sample_ids, labels, weights, scores, scores_and_offsets):
            rec = {schema_params[constants.SAMPLE_ID]: int(rec_id),
                   schema_params[constants.PREDICTION_SCORE]: float(rec_score),
                   schema_params[constants.PREDICTION_SCORE_PER_COORDINATE]: float(rec_score_and_offset)
                   }
            if self._has_label(schema_params[constants.LABEL]):
                rec[schema_params[constants.LABEL]] = int(rec_label)
            if self._has_feature(schema_params[constants.SAMPLE_WEIGHT]):
                rec[schema_params[constants.SAMPLE_WEIGHT]] = int(rec_weight)
            records.append(rec)

        output_file = os.path.join(output_dir, "part-{0:05d}.avro".format(task_index))
        error_msg = "worker {} encountered error in writing inference results".format(task_index)
        with tf1.gfile.GFile(output_file, 'wb') as f:
            try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
        logging("Worker {} saved inference result to {}".format(task_index, output_file))
Example #13
def _write_toavro(table,
                  target,
                  mode,
                  schema,
                  sample,
                  codec='deflate',
                  compression_level=None,
                  **avro_args):
    if table is None or len(table) <= 0:
        return
    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)

    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    with target.open(mode) as target_file:
        # delay the import of fastavro for not breaking when unused
        import fastavro
        parsed_schema = fastavro.parse_schema(schema)
        # this could raise an error when any value is not of a supported type
        fastavro.writer(fo=target_file,
                        schema=parsed_schema,
                        records=rows,
                        codec=codec,
                        codec_compression_level=compression_level,
                        **avro_args)
Example #14
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))
    attributes: dict = field(default_factory=ClickAttribute.attributes)

    #
    # TODO: Update this Avro schema to include a map of attributes
    #       See: https://avro.apache.org/docs/1.8.2/spec.html#Maps
    #
    schema = parse_schema(
        {
            "type": "record",
            "name": "click_event",
            "namespace": "com.udacity.lesson3.exercise2",
            "fields": [
                {"name": "email", "type": "string"},
                {"name": "timestamp", "type": "string"},
                {"name": "uri", "type": "string"},
                {"name": "number", "type": "int"}
                #
                # TODO: Add the attributes map!
                #
            ],
        }
    )

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
Example #15
class Purchase:
    username: str = field(default_factory=faker.user_name)
    currency: str = field(default_factory=faker.currency_code)
    amount: int = field(default_factory=lambda: random.randint(100, 200000))

    schema = parse_schema({
        "type":
        "record",
        "name":
        "purchase",
        "namespace":
        "com.udacity.lesson3.sample2",
        "fields": [
            {
                "name": "username",
                "type": "string"
            },
            {
                "name": "currency",
                "type": "string"
            },
            {
                "name": "amount",
                "type": "int"
            },
        ],
    })

    def serialize(self):
        #
        # TODO: Modify the following sample to use Avro instead of JSON
        #
        out = io.BytesIO()
        writer(out, Purchase.schema, [asdict(self)])
        return out.getvalue()
Example #16
def read_schema() -> fastavro.schema:
    with open("avro/schema/schema-SKDB.public.sdcocdmst.json", "r") as file:
        schema = json.load(file)
        schema = json.loads(schema["schema"])
        schema = fastavro.parse_schema(schema)

    return schema
Example #17
def fix_record_for_avro(record, avro_schema):
    for field in avro_schema['fields']:
        field_name = field['name']
        datatype = field['type']
        if isinstance(datatype, dict):
            # This is a record type definition so we need to recurse a level deeper.
            record[field_name] = fix_record_for_avro(
                record[field_name], fastavro.parse_schema(datatype))[0]
        elif isinstance(datatype, list) and isinstance(datatype[1], dict):
            logical_type = datatype[1].get('logicalType', None)
            if logical_type:
                if logical_type.find('-') > -1:
                    logical_prefix, precision = logical_type.split('-')
                else:
                    logical_prefix = logical_type
                    precision = None
                if logical_prefix == 'timestamp':
                    is_micros = (precision == 'micros')
                    record[field_name] = datetime_to_epoch_timestamp(
                        record[field_name], micros=is_micros)
                elif logical_type == 'date':
                    record[field_name] = date_to_epoch_date(record[field_name])
                elif logical_prefix == 'time':
                    is_micros = (precision == 'micros')

                    record[field_name] = time_to_epoch_time(record[field_name],
                                                            micros=is_micros)
    return [record]
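
# An illustrative call for the helper above; the schema and record are hypothetical,
# and the epoch-conversion helpers it relies on (datetime_to_epoch_timestamp and
# friends) are assumed to be defined elsewhere in the original module.
import datetime

event_schema = fastavro.parse_schema({
    "type": "record",
    "name": "Event",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "created_at",
         "type": ["null", {"type": "long", "logicalType": "timestamp-micros"}]},
    ],
})
row = {"id": "abc", "created_at": datetime.datetime(2020, 1, 1)}
fixed_rows = fix_record_for_avro(row, event_schema)  # created_at becomes epoch micros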
Example #18
def gen_schema():
    schema = {
        'doc': 'Twitter',
        'name': 'Twitter',
        'namespace': 'twitter',
        'type': 'record',
        'fields': [
            {
                'name': 'twitter_name',
                'type': 'string'
            },
            {
                'name': 'twitter_token',
                'type': 'float'
            },
            {
                'name': 'uuid',
                'type': 'string'
            },
        ],
    }
    parsed_schema = parse_schema(schema)

    return parsed_schema
Example #19
def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:
    # In this test, we want to read in from JSON -> MCE object.
    # Next we serialize from MCE to Avro and then deserialize back to MCE.
    # Finally, we want to compare the two MCE objects.

    json_path = pytestconfig.rootpath / json_filename
    mces = list(iterate_mce_file(str(json_path)))

    # Serialize to Avro.
    parsed_schema = fastavro.parse_schema(
        json.loads(getMetadataChangeEventSchema()))
    fo = io.BytesIO()
    out_records = [mce.to_obj(tuples=True) for mce in mces]
    fastavro.writer(fo, parsed_schema, out_records)

    # Deserialize from Avro.
    fo.seek(0)
    in_records = list(fastavro.reader(fo, return_record_name=True))
    in_mces = [
        MetadataChangeEventClass.from_obj(record, tuples=True)
        for record in in_records
    ]

    # Check diff
    assert len(mces) == len(in_mces)
    for i in range(len(mces)):
        assert mces[i] == in_mces[i]
Example #20
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))

    schema = parse_schema({
        "type":
        "record",
        "name":
        "click_event",
        "namespace":
        "com.hcvn.bi",
        "fields": [{
            "name": "email",
            "type": "string"
        }, {
            "name": "timestamp",
            "type": "string"
        }, {
            "name": "uri",
            "type": "string"
        }, {
            "name": "number",
            "type": "int"
        }]
    })

    def serialize(self):
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
Example #21
def test_parse_schema_accepts_nested_records_from_arrays():
    parsed_schema = fastavro.parse_schema({
        "fields": [{
            "type": {
                "items": {
                    "type": "record",
                    "fields": [{
                        "type": "string",
                        "name": "text"
                    }],
                    "name": "Nested"
                },
                "type": "array",
            },
            "name": "multiple"
        }, {
            "type": {
                "type": "array",
                "items": "Nested"
            },
            "name": "single"
        }],
        "type":
        "record",
        "name":
        "test_parse_schema_accepts_nested_records_from_arrays",
    })
    assert "Nested" == parsed_schema["fields"][1]["type"]["items"]
Example #22
    def _write_inference_result(self, sample_ids, labels, weights, prediction_score,
                                prediction_score_per_coordinate, task_index, schema_params: SchemaParams, output_dir):
        """ Write inference results. """
        output_avro_schema = get_inference_output_avro_schema(
            self.metadata,
            True,
            schema_params,
            has_weight=self._has_feature(schema_params.weight_column_name))
        parsed_schema = parse_schema(output_avro_schema)

        records = []
        for rec_id, rec_label, rec_weight, rec_prediction_score, rec_prediction_score_per_coordinate in \
                zip(sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate):
            rec = {schema_params.uid_column_name: int(rec_id),
                   schema_params.prediction_score_column_name: float(rec_prediction_score),
                   schema_params.prediction_score_per_coordinate_column_name: float(rec_prediction_score_per_coordinate)}
            if self._has_label(schema_params.label_column_name):
                rec[schema_params.label_column_name] = int(rec_label)
            if self._has_feature(schema_params.weight_column_name):
                rec[schema_params.weight_column_name] = int(rec_weight)
            records.append(rec)

        output_file = os.path.join(output_dir, f"part-{task_index:05d}.avro")
        error_msg = f"worker {task_index} encountered error in writing inference results"
        with tf1.gfile.GFile(output_file, 'wb') as f:
            try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
        logging(f"Worker {task_index} saved inference result to {output_file}")
Example #23
def test_parse_schema_resolves_references_from_unions():
    parsed_schema = fastavro.parse_schema({
        "namespace":
        "com.other",
        "name":
        "Outer",
        "type":
        "record",
        "fields": [
            {
                "name":
                "a",
                "type": [
                    "null", {
                        "type": "record",
                        "name": "Inner",
                        "fields": [{
                            "name": "the_thing",
                            "type": "string"
                        }]
                    }
                ]
            },
            {
                "name": "b",
                # This should resolve to com.other.Inner because of the
                # `namespace` of the enclosing record.
                "type": ["null", "Inner"]
            }
        ]
    })
    assert "com.other.Inner" == parsed_schema["fields"][1]["type"][1]
Example #24
def test_order_of_values_in_map():
    """https://github.com/fastavro/fastavro/issues/303"""
    schema = {
        'doc': 'A weather reading.',
        'name': 'Weather',
        'namespace': 'test',
        'type': 'record',
        'fields': [{
            'name': 'metadata',
            'type': {
                'type': 'map',
                'values': [{
                    'type': 'array',
                    'items': 'string'
                }, {
                    'type': 'map',
                    'values': ['string']
                }]
            }
        }],
    }
    parsed_schema = fastavro.parse_schema(schema)

    records = [{'metadata': {'map1': {'map2': 'str'}}}]

    assert records == roundtrip(parsed_schema, records)
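
# Several of the tests in this listing call a `roundtrip` helper that is not shown
# here; a plausible minimal sketch of it, inferred from how it is used (write the
# records to an in-memory buffer, read them back, return the list):
import io
import fastavro

def roundtrip(parsed_schema, records):
    buffer = io.BytesIO()
    fastavro.writer(buffer, parsed_schema, records)
    buffer.seek(0)
    return list(fastavro.reader(buffer))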
Example #25
    def __init__(
        self,
        file_path_prefix=None,
        location=None,
        schema=parse_schema(KLIO_SCHEMA_OBJ),
        codec="deflate",
        file_name_suffix="",
        num_shards=0,
        shard_name_template=None,
        mime_type="application/x-avro",
    ):

        file_path = self._get_file_path(file_path_prefix, location)

        super(KlioWriteToAvro, self).__init__(
            file_path_prefix=file_path,
            schema=schema,
            codec=codec,
            file_name_suffix=file_name_suffix,
            num_shards=num_shards,
            shard_name_template=shard_name_template,
            mime_type=mime_type,
            use_fastavro=True,
        )

        self._sink = _KlioFastAvroSink(
            file_path,
            schema,
            codec,
            file_name_suffix,
            num_shards,
            shard_name_template,
            mime_type,
        )
Example #26
def test_parse_schema_accepts_nested_namespaces():
    parsed_schema = fastavro.parse_schema({
        "namespace":
        "com.example",
        "name":
        "Outer",
        "type":
        "record",
        "fields": [
            {
                "name": "a",
                "type": {
                    "type": "record",
                    "name": "Inner",
                    "fields": [{
                        "name": "the_thing",
                        "type": "string"
                    }]
                }
            },
            {
                "name": "b",
                # This should resolve to com.example.Inner because of the
                # `namespace` of the enclosing record.
                "type": "Inner"
            },
            {
                "name": "b",
                "type": "com.example.Inner"
            }
        ]
    })
    assert "com.example.Inner" == parsed_schema["fields"][0]["type"]["name"]
    assert "com.example.Inner" == parsed_schema["fields"][1]["type"]
Example #27
def test_newer_versions_of_named_schemas_2():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "name":
        "Weather",
        "type":
        "record",
        "fields": [
            {
                "name": "place1",
                "type": {
                    "name": "Location",
                    "type": "record",
                    "fields": [{
                        "name": "city",
                        "type": "string"
                    }],
                },
            },
            {
                "name": "place2",
                "type": "Location",
            },
        ],
    }

    example_1 = {"place1": {"city": "London"}, "place2": {"city": "Berlin"}}
    parsed_schema = fastavro.parse_schema(schema)

    assert example_1 == roundtrip(parsed_schema, example_1)
Example #28
def main(args):
    schema_row_size = struct.calcsize(args.format)
    parsed_schema = fastavro.parse_schema(schema(args.format))

    file_path = Path(args.bin_input)
    file_size = file_path.stat().st_size

    out_filename = (args.output if args.output else filename_from_args(
        args, prefix='sima.exported.', postfix='.avro'))

    if file_size % schema_row_size != 0:
        msg = 'Format {} does not fit into file {}'.format(
            args.format, args.bin_input)
        raise ExportError(msg)

    with open(str(file_path), 'rb') as binf, open(out_filename, 'wb') as avrf:
        records = ({
            "col_{}".format(i): v
            for i, v in enumerate(struct.unpack(args.format, piece))
        } for piece in read_in_chunks(binf, schema_row_size))
        fastavro.writer(avrf,
                        parsed_schema,
                        records,
                        codec='deflate',
                        codec_compression_level=4)
Example #29
def test_record_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type":
        "record",
        "name":
        "test_record_named_type",
        "fields": [{
            "name": "test1",
            "type": {
                "type": "record",
                "name": "my_record",
                "fields": [{
                    "name": "field1",
                    "type": "string",
                }]
            },
        }, {
            "name": "test2",
            "type": "my_record",
        }]
    }

    record = {"test1": {"field1": "foo"}, "test2": {"field1": "bar"}}
    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
Example #30
def write_garmin_file_object_to_file(gfile, avro_file):
    parsed_schema = fastavro.parse_schema(GarminFile._avro_schema)
    js = [gfile.to_dict()]
    with gzip.open('%s.tmp' % avro_file, 'wb') as f:
        fastavro.writer(f, parsed_schema, js, validator=True)
    os.rename('%s.tmp' % avro_file, avro_file)
    return True
Example #31
def validater(schema, records, runs=1):
    times = []
    valid = []
    schema = parse_schema(schema)
    for _ in range(runs):
        start = time.time()
        valid = validate_many(records, schema)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return valid
Example #32
def write(schema, records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        iostream = BytesIO()
        start = time.time()
        writer(iostream, schema, records)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return iostream
Example #33
def read_schemaless(iostream, schema, num_records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        for _ in range(num_records):
            iostream.seek(0)
            start = time.time()
            record = schemaless_reader(iostream, schema)
            end = time.time()
            times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return record
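
# A sketch tying the benchmark helpers above together (schema and records invented
# for illustration; the surrounding benchmark harness is not shown in this listing).
# Note that read_schemaless expects a buffer produced by schemaless_writer, not the
# container-format buffer returned by write().
from io import BytesIO
from fastavro import parse_schema, schemaless_writer

sample_schema = {
    "type": "record",
    "name": "Sample",
    "fields": [{"name": "n", "type": "int"}],
}
sample_records = [{"n": i} for i in range(1000)]

validater(sample_schema, sample_records, runs=3)
write(sample_schema, sample_records, runs=3)

single = BytesIO()
schemaless_writer(single, parse_schema(sample_schema), sample_records[0])
read_schemaless(single, sample_schema, num_records=1, runs=3)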
Example #34
def _bq_to_avro_blocks(bq_blocks, avro_schema_json):
    avro_schema = fastavro.parse_schema(avro_schema_json)
    avro_blocks = []
    for block in bq_blocks:
        blockio = six.BytesIO()
        for row in block:
            fastavro.schemaless_writer(blockio, avro_schema, row)

        response = bigquery_storage_v1beta1.types.ReadRowsResponse()
        response.avro_rows.row_count = len(block)
        response.avro_rows.serialized_binary_rows = blockio.getvalue()
        avro_blocks.append(response)
    return avro_blocks
Example #35
def _avro_schema(read_session):
    """Extract and parse Avro schema from a read session.

    Args:
        read_session ( \
            ~google.cloud.bigquery_storage_v1beta1.types.ReadSession \
        ):
            The read session associated with this read rows stream. This
            contains the schema, which is required to parse the data
            blocks.

    Returns:
        Tuple[fastavro.schema, Tuple[str]]:
            A parsed Avro schema, using :func:`fastavro.schema.parse_schema`
            and the column names for a read session.
    """
    json_schema = json.loads(read_session.avro_schema.schema)
    column_names = tuple((field["name"] for field in json_schema["fields"]))
    return fastavro.parse_schema(json_schema), column_names
Example #36
SCHEMA = {
    "namespace": "big_obj",
    "type": "record",
    "name": "DirFolders",
    "fields": [
        {"name": "base_folder", "type": {
            "type": "record",
            "name": "Folder",
            "fields": [
                {"name": "path", "type": "string"},
                {"name": "last_mod", "type": ["null", "string"], "default": "null"},
                {"name": "files", "type": ["null", {"type": "array", "items": {
                    "type": "record",
                    "name": "File",
                    "fields": [
                        {"name": "file_name", "type": "string"},
                        {"name": "content", "type": "bytes"},
                        {"name": "last_mod", "type": ["null", "string"], "default": "null"}
                    ]
                }}], "default": "null"},
                {"name": "folders", "type":
                                             {"type": "array", "items": "Folder"}}
            ]}},

        {"name": "details", "type": "string"},
    ]
}

avro_schema = fastavro.parse_schema(SCHEMA)
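
# A hedged writing sketch for the recursive schema above (folder contents invented
# for illustration): "folders" nests Folder records to arbitrary depth.
import io

dir_record = {
    "base_folder": {
        "path": "/tmp/demo",
        "last_mod": None,
        "files": [{"file_name": "a.txt", "content": b"hello", "last_mod": None}],
        "folders": [
            {"path": "/tmp/demo/sub", "last_mod": None, "files": None, "folders": []},
        ],
    },
    "details": "example run",
}

buf = io.BytesIO()
fastavro.writer(buf, avro_schema, [dir_record])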