def read_csv_data(input_data, single_record = False, line_terminator = '\r\n'):
    """Return the contents of ``input_data`` as a :class:`str <python:str>`.

    :param input_data: The CSV data to read.

      .. note::

        If ``input_data`` is Path-like, then the underlying file **must** start with a header row.

    :type input_data: Path-like or :class:`str <python:str>`

    :param single_record: If ``True``, will return only the first data record. If ``False``, will return all data records (including the header row if present). Defaults to ``False``.
    :type single_record: :class:`bool <python:bool>`

    :param line_terminator: The string used to separate records (rows). Defaults to ``'\\r\\n'``.
    :type line_terminator: :class:`str <python:str>`

    :returns: ``input_data`` as a :class:`str <python:str>`, or :obj:`None <python:None>` if no data could be read
    :rtype: :class:`str <python:str>` / :obj:`None <python:None>`
    """
    try:
        input_data = input_data.strip()
    except AttributeError:
        pass

    original_input_data = input_data

    if checkers.is_file(input_data) and not single_record:
        with open(input_data, 'r') as input_file:
            input_data = input_file.read()
    elif checkers.is_file(input_data) and single_record:
        input_data = linecache.getline(original_input_data, 2)
        if input_data == '':
            input_data = linecache.getline(original_input_data, 1)
        if input_data == '':
            input_data = None
    elif single_record:
        try:
            if line_terminator in input_data:
                parsed_data = input_data.split(line_terminator)
            elif line_terminator == '\r\n' and '\r' in input_data:
                parsed_data = input_data.split('\r')
            elif line_terminator == '\r\n' and '\n' in input_data:
                parsed_data = input_data.split('\n')
            else:
                parsed_data = [input_data]
        except TypeError:
            parsed_data = [input_data]

        if not parsed_data:
            input_data = None
        elif len(parsed_data) == 1:
            input_data = parsed_data[0]
        else:
            input_data = parsed_data[1]

    return input_data
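# Hedged usage sketch (not part of the source): demonstrates read_csv_data() on a
# throw-away CSV file. The file name 'example_people.csv' and its contents are
# assumptions made up for illustration; only read_csv_data() comes from the code above.
def _example_read_csv_data():
    import os

    with open('example_people.csv', 'w', newline='') as example_file:
        example_file.write('id|name\r\n1|Alice\r\n2|Bob\r\n')

    try:
        # single_record = True returns only the first data record (the file's second line).
        print(read_csv_data('example_people.csv', single_record=True))
        # single_record = False returns the full contents, header row included.
        print(read_csv_data('example_people.csv', single_record=False))
    finally:
        os.remove('example_people.csv')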
def parse_json(input_data, deserialize_function=None, **kwargs):
    """De-serialize JSON data into a Python :class:`dict <python:dict>` object.

    :param input_data: The JSON data to de-serialize. May be a JSON string or a path to a file containing JSON.
    :type input_data: :class:`str <python:str>` / Path-like object

    :param deserialize_function: Optionally override the default JSON deserializer. Defaults to :obj:`None <python:None>`, which calls the default :ref:`simplejson.loads() <simplejson:simplejson.loads>` function from the `simplejson <https://github.com/simplejson/simplejson>`_ library.

      .. note::

        Use the ``deserialize_function`` parameter to override the default JSON deserializer. A valid ``deserialize_function`` is expected to accept a single :class:`str <python:str>` and return a :class:`dict <python:dict>`, similar to :ref:`simplejson.loads() <simplejson:simplejson.loads>`.

        If you wish to pass additional arguments to your ``deserialize_function``, pass them as keyword arguments (in ``kwargs``).

    :type deserialize_function: callable / :obj:`None <python:None>`

    :param kwargs: Optional keyword parameters that are passed to the JSON deserializer function. By default, these are options which are passed to :ref:`simplejson.loads() <simplejson:simplejson.loads>`.
    :type kwargs: keyword arguments

    :returns: A :class:`dict <python:dict>` representation of ``input_data``.
    :rtype: :class:`dict <python:dict>`
    """
    is_file = False
    if checkers.is_file(input_data):
        is_file = True

    if deserialize_function is None and not is_file:
        deserialize_function = json.loads
    elif deserialize_function is None and is_file:
        deserialize_function = json.load
    else:
        if checkers.is_callable(deserialize_function) is False:
            raise ValueError('deserialize_function (%s) is not callable' % deserialize_function)

    if not input_data:
        raise DeserializationError('input_data is empty')

    if not is_file:
        try:
            input_data = validators.string(input_data, allow_empty=False)
        except ValueError:
            raise DeserializationError('input_data is not a valid string')

        from_json = deserialize_function(input_data, **kwargs)
    else:
        with open(input_data, 'r') as input_file:
            from_json = deserialize_function(input_file, **kwargs)

    return from_json
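# Hedged usage sketch (not part of the source): parse_json() accepts either a JSON string
# or a path to a JSON file. The inline JSON below is made up for illustration.
def _example_parse_json():
    as_dict = parse_json('{"id": 1, "name": "Alice"}')
    assert as_dict['name'] == 'Alice'

    # A custom deserializer can be supplied; here the standard-library json module is used
    # in place of simplejson, with a keyword argument forwarded through kwargs.
    import json as stdlib_json
    as_dict = parse_json('{"price": 19.99}',
                         deserialize_function=stdlib_json.loads,
                         parse_float=str)
    assert as_dict['price'] == '19.99'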
def load(source) -> etree._ElementTree:  # pylint: disable=protected-access
    '''
    Load an XML document

    args:
        source: XML source. Either path, url, string, or an already-loaded LXML ElementTree
    returns:
        Loaded XML object tree, or None on invalid source
    '''
    if not isinstance(source, (str, bytes)) or len(source) < 1:
        # pylint: disable=protected-access
        return source if isinstance(source, etree._ElementTree) else None

    source = source.strip()
    if source[0] == ord('<'):  # Handle source as bytes
        source = io.BytesIO(source)
    elif source[0] == '<':  # Handle source as string
        source = io.StringIO(source)
    elif checkers.is_file(source):  # Handle source as local file
        pass  # etree.parse handles local file paths natively
    elif checkers.is_url(source):  # Handle source as URL
        response = requests.get(source, timeout=10)
        if not response:
            app.logger.warning(
                f"Failed to retrieve XML URL (or timed out): {source}")
            return None
        source = io.BytesIO(response.content)
    else:
        app.logger.warning(
            f"XML source is not a valid file, URL, or XML string. {source[:40]}"
            + (len(source) > 40) * '...')
        return None

    return etree.parse(source)
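# Hedged usage sketch (not part of the source): load() accepts an XML string, bytes, a file
# path, or a URL. The XML snippet below is made up for illustration, and app.logger is
# assumed to be available exactly as the function above assumes it.
def _example_load_xml():
    tree = load('<catalog><book id="1">Example</book></catalog>')
    if tree is not None:
        root = tree.getroot()
        print(root.tag)               # "catalog"
        print(root.findtext('book'))  # "Example"

    # Input that is neither XML, a file, nor a URL returns None instead of raising.
    assert load('not xml at all') is None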
def main(payload, endpoint, processes, threads, samples, time_based):
    file_type = None
    if checkers.is_url(payload):
        if payload.lower().endswith(".json"):
            file_type = "json"
            payload_data = requests.get(payload).json()
        elif payload.lower().endswith(".jpg"):
            file_type = "jpg"
            payload_data = imageio.imread(payload)
    elif checkers.is_file(payload):
        if payload.lower().endswith(".json"):
            file_type = "json"
            with open(payload, "r") as f:
                payload_data = json.load(f)
        elif payload.lower().endswith(".jpg"):
            file_type = "jpg"
            payload_data = cv2.imread(payload, cv2.IMREAD_COLOR)
    else:
        print(f"'{payload}' isn't a URL resource, nor is it a local file")
        sys.exit(1)

    if file_type is None:
        print(f"'{payload}' doesn't point to a jpg image or to a json file")
        sys.exit(1)

    if file_type == "jpg":
        data = image_to_jpeg_bytes(payload_data)
    if file_type == "json":
        data = json.dumps(payload_data)

    print("Starting the inference throughput test...")
    results = []
    start = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=processes) as executor:
        results = executor_submitter(executor, processes, process_worker, threads,
                                     data, endpoint, samples, time_based)
    end = time.time()
    elapsed = end - start

    total_requests = sum(results)

    print(f"A total of {total_requests} requests have been served in {elapsed} seconds")
    print(f"Avg number of inferences/sec is {total_requests / elapsed}")
    print(f"Avg time spent on an inference is {elapsed / total_requests} seconds")
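# Hedged usage sketch (not part of the source): shows how main() might be invoked directly.
# The payload URL, endpoint address, and concurrency settings are made-up values, and the
# helpers (executor_submitter, process_worker, image_to_jpeg_bytes) are assumed to be
# defined elsewhere in the same module, as the function above implies.
def _example_throughput_test():
    main(payload="https://example.com/sample.json",  # JSON payload fetched over HTTP
         endpoint="http://localhost:8080/predict",   # inference endpoint under test
         processes=2,                                # worker processes
         threads=4,                                  # threads per process
         samples=100,                                # number of requests when not time-based
         time_based=False)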
def from_yaml(as_yaml: Union[str, 'PathLike[Any]', BytesIO],
              target: Optional[Union['PathLike[Any]', BytesIO]] = None,
              compress: bool = False,
              **kwargs):
    """Convert YAML data into an SPSS dataset.

    .. tip::

      If you pass any additional keyword arguments, those keyword arguments will be passed onto the :meth:`DataFrame.from_dict() <pandas:pandas.DataFrame.from_dict>` method.

    :param as_yaml: The YAML data that you wish to convert into an SPSS dataset.
    :type as_yaml: :class:`str <python:str>` / File-location / :class:`BytesIO <python:io.BytesIO>`

    :param target: The target to which the SPSS dataset should be written. Accepts either a filename/path, a :class:`BytesIO <python:io.BytesIO>` object, or :obj:`None <python:None>`. If :obj:`None <python:None>`, will return a :class:`BytesIO <python:io.BytesIO>` object containing the SPSS dataset. Defaults to :obj:`None <python:None>`.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` / :obj:`None <python:None>`

    :param compress: If ``True``, will return data in the compressed ZSAV format. If ``False``, will return data in the standard SAV format. Defaults to ``False``.
    :type compress: :class:`bool <python:bool>`

    :param kwargs: Additional keyword arguments which will be passed onto the :meth:`DataFrame.from_dict() <pandas:pandas.DataFrame.from_dict>` method.
    :type kwargs: :class:`dict <python:dict>`

    :returns: A :class:`BytesIO <python:io.BytesIO>` object containing the SPSS data if ``target`` is :obj:`None <python:None>` or not a filename, otherwise :obj:`None <python:None>`
    :rtype: :class:`BytesIO <python:io.BytesIO>` or :obj:`None <python:None>`
    """
    if checkers.is_bytesIO(as_yaml):
        # yaml.safe_load() accepts a stream directly, so a BytesIO object does not need to
        # be opened with open() first.
        as_dict = yaml.safe_load(as_yaml)
    elif checkers.is_file(as_yaml):
        with open(as_yaml, 'rb') as yaml_file:
            as_dict = yaml.safe_load(yaml_file)
    else:
        as_yaml = validators.string(as_yaml, allow_empty=False)
        as_dict = yaml.safe_load(as_yaml)

    as_json = json.dumps(as_dict)

    return from_json(as_json, target=target, compress=compress, **kwargs)
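# Hedged usage sketch (not part of the source): converts a small in-memory YAML document
# into SPSS data using from_yaml() above. The YAML content and output filename are made up,
# and from_json() is assumed to be available as the function above requires.
def _example_from_yaml():
    yaml_data = (
        "id:\n"
        "  0: 1\n"
        "  1: 2\n"
        "name:\n"
        "  0: Alice\n"
        "  1: Bob\n"
    )

    # With target left as None, the docstring says a BytesIO of SAV-format data is returned.
    spss_buffer = from_yaml(yaml_data)
    print(len(spss_buffer.getvalue()), "bytes of SPSS data")

    # Passing a filename writes the dataset to disk instead (ZSAV when compress=True).
    from_yaml(yaml_data, target='example_output.zsav', compress=True)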
def check_input_file(input_directory, input_value):
    inputs = os.path.abspath(input_directory)
    if not os.path.exists(input_directory):
        raise AssertionError('input directory (%s) does not exist' % inputs)
    elif not os.path.isdir(input_directory):
        raise AssertionError('input directory (%s) is not a directory' % inputs)

    try:
        input_file = os.path.join(input_directory, input_value)
    except (TypeError, AttributeError):
        input_file = None

    if input_file is not None and checkers.is_file(input_file):
        input_value = input_file

    return input_value
def test_read_csv_data(input_files, input_data, single_record, expected_result):
    inputs = os.path.abspath(input_files)
    if not os.path.exists(input_files):
        raise AssertionError('input directory (%s) does not exist' % inputs)
    elif not os.path.isdir(input_files):
        raise AssertionError('input directory (%s) is not a directory' % inputs)

    input_file = os.path.join(input_files, input_data)
    if checkers.is_file(input_file):
        input_data = input_file

    result = read_csv_data(input_data, single_record=single_record)

    if result is None:
        assert result == expected_result
    else:
        assert result.strip() == expected_result.strip()
def from_csv(cls,
             serialized,
             tablename,
             metadata,
             primary_key,
             column_kwargs=None,
             skip_nested=True,
             default_to_str=False,
             type_mapping=None,
             delimiter='|',
             wrap_all_strings=False,
             null_text='None',
             wrapper_character="'",
             double_wrapper_character_when_nested=False,
             escape_character="\\",
             line_terminator='\r\n',
             **kwargs):
    """Generate a :class:`Table` object from a :term:`CSV <Comma-Separated Value (CSV)>` string.

    .. versionadded:: 0.3.0

    :param serialized: The CSV data whose column headers will be treated as column names, while value data types will determine :term:`model attribute` data types.

      .. note::

        If a Path-like object, will read the file contents from a file that is assumed to include a header row. If a :class:`str <python:str>` and has more than one record (line), will assume the first line is a header row. If a :class:`list <python:list>`, will assume the first item is the header row.

    :type serialized: :class:`str <python:str>` / Path-like object / :class:`list <python:list>`

    :param tablename: The name of the SQL table to which the model corresponds.
    :type tablename: :class:`str <python:str>`

    :param metadata: a :class:`MetaData <sqlalchemy:sqlalchemy.schema.MetaData>` object which will contain this table. The metadata is used as a point of association of this table with other tables which are referenced via foreign key. It also may be used to associate this table with a particular :class:`Connectable <sqlalchemy:sqlalchemy.engine.Connectable>`.
    :type metadata: :class:`MetaData <sqlalchemy:sqlalchemy.schema.MetaData>`

    :param primary_key: The name of the column/key that should be used as the table's primary key.
    :type primary_key: :class:`str <python:str>`

    :param column_kwargs: An optional dictionary whose keys correspond to column/key, and whose values are themselves dictionaries with keyword arguments that will be passed to the applicable :class:`Column` constructor. Defaults to :obj:`None <python:None>`.
    :type column_kwargs: :class:`dict <python:dict>` / :obj:`None <python:None>`

    :param skip_nested: If ``True`` then any keys in ``serialized`` that feature nested items (e.g. iterables, :class:`dict <python:dict>` objects, etc.) will be ignored. If ``False``, will treat nested items as :class:`str <python:str>`. Defaults to ``True``.
    :type skip_nested: :class:`bool <python:bool>`

    :param default_to_str: If ``True``, will automatically set a key/column whose value type cannot be determined to ``str`` (:class:`Text <sqlalchemy:sqlalchemy.types.Text>`). If ``False``, will use the value type's ``__name__`` attribute and attempt to find a mapping. Defaults to ``False``.
    :type default_to_str: :class:`bool <python:bool>`

    :param type_mapping: Determines how value types in ``serialized`` map to SQL column data types. To add a new mapping or override a default, set a key to the name of the value type in Python, and set the value to a :doc:`SQLAlchemy Data Type <sqlalchemy:core/types>`. The following are the default mappings applied:

      .. list-table::
         :widths: 30 30
         :header-rows: 1

         * - Python Literal
           - SQL Column Type
         * - ``bool``
           - :class:`Boolean <sqlalchemy:sqlalchemy.types.Boolean>`
         * - ``str``
           - :class:`Text <sqlalchemy:sqlalchemy.types.Text>`
         * - ``int``
           - :class:`Integer <sqlalchemy:sqlalchemy.types.Integer>`
         * - ``float``
           - :class:`Float <sqlalchemy:sqlalchemy.types.Float>`
         * - ``date``
           - :class:`Date <sqlalchemy:sqlalchemy.types.Date>`
         * - ``datetime``
           - :class:`DateTime <sqlalchemy:sqlalchemy.types.DateTime>`
         * - ``time``
           - :class:`Time <sqlalchemy:sqlalchemy.types.Time>`

    :type type_mapping: :class:`dict <python:dict>` with type names as keys and column data types as values.

    :param delimiter: The delimiter used between columns. Defaults to ``|``.
    :type delimiter: :class:`str <python:str>`

    :param wrapper_character: The string used to wrap string values when wrapping is applied. Defaults to ``'``.
    :type wrapper_character: :class:`str <python:str>`

    :param null_text: The string used to indicate an empty value if empty values are wrapped. Defaults to ``'None'``.
    :type null_text: :class:`str <python:str>`

    :param kwargs: Any additional keyword arguments will be passed to the :class:`Table` constructor. For a full list of options, please see :class:`sqlalchemy.schema.Table <sqlalchemy:sqlalchemy.schema.Table>`.

    :returns: A :class:`Table` object.
    :rtype: :class:`Table`

    :raises DeserializationError: if ``serialized`` is not a valid :class:`str <python:str>`
    :raises UnsupportedValueTypeError: when a value in ``serialized`` does not have a corresponding key in ``type_mapping``
    :raises ValueError: if ``tablename`` is empty
    :raises ValueError: if ``primary_key`` is empty
    :raises CSVStructureError: if there are fewer than 2 (two) rows in ``serialized`` or if column headers are not valid Python variable names

    """
    # pylint: disable=line-too-long,invalid-name,too-many-arguments
    if not checkers.is_file(serialized):
        serialized = read_csv_data(serialized, single_record=False)

    from_csv = parse_csv(serialized,
                         delimiter=delimiter,
                         wrap_all_strings=wrap_all_strings,
                         null_text=null_text,
                         wrapper_character=wrapper_character,
                         double_wrapper_character_when_nested=double_wrapper_character_when_nested,
                         escape_character=escape_character,
                         line_terminator=line_terminator)

    table = cls.from_dict(from_csv,
                          tablename,
                          metadata,
                          primary_key,
                          column_kwargs=column_kwargs,
                          skip_nested=skip_nested,
                          default_to_str=default_to_str,
                          type_mapping=type_mapping,
                          **kwargs)

    return table
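# Hedged usage sketch (not part of the source): builds a Table from a two-row CSV string
# via the from_csv() classmethod above. The table name, column names, and values are made
# up, and the method is assumed to be exposed on a Table class as the cls parameter implies.
def _example_table_from_csv():
    from sqlalchemy import MetaData

    csv_data = "id|name|balance\r\n1|Alice|123.45\r\n"

    table = Table.from_csv(csv_data,
                           tablename='users',
                           metadata=MetaData(),
                           primary_key='id')
    print([column.name for column in table.c])  # ['id', 'name', 'balance']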
def parse_csv(input_data,
              delimiter='|',
              wrap_all_strings=False,
              null_text='None',
              wrapper_character="'",
              double_wrapper_character_when_nested=False,
              escape_character="\\",
              line_terminator='\r\n'):
    """De-serialize CSV data into a Python :class:`dict <python:dict>` object.

    .. versionadded:: 0.3.0

    .. tip::

      Unwrapped empty column values are automatically interpreted as null (:obj:`None <python:None>`).

    :param input_data: The CSV data to de-serialize. Should include column headers and at least **one** row of data. Will ignore any rows of data beyond the first row.
    :type input_data: :class:`str <python:str>` / Path-like object / iterable of :class:`str <python:str>`

    :param delimiter: The delimiter used between columns. Defaults to ``|``.
    :type delimiter: :class:`str <python:str>`

    :param wrapper_character: The string used to wrap string values when wrapping is applied. Defaults to ``'``.
    :type wrapper_character: :class:`str <python:str>`

    :param null_text: The string used to indicate an empty value if empty values are wrapped. Defaults to ``'None'``.
    :type null_text: :class:`str <python:str>`

    :returns: A :class:`dict <python:dict>` representation of the CSV record.
    :rtype: :class:`dict <python:dict>`

    :raises DeserializationError: if ``input_data`` is not a valid :class:`str <python:str>`
    :raises CSVStructureError: if there are fewer than 2 (two) rows in ``input_data`` or if column headers are not valid Python variable names
    """
    use_file = False
    if not checkers.is_file(input_data) and not checkers.is_iterable(input_data):
        try:
            input_data = validators.string(input_data, allow_empty=False)
        except (ValueError, TypeError):
            raise DeserializationError("input_data expects a 'str', received '%s'" \
                % type(input_data))

        input_data = [input_data]
    elif checkers.is_file(input_data):
        use_file = True

    if not wrapper_character:
        wrapper_character = '\''

    if wrap_all_strings:
        quoting = csv.QUOTE_NONNUMERIC
    else:
        quoting = csv.QUOTE_MINIMAL

    if 'sqlathanor' in csv.list_dialects():
        csv.unregister_dialect('sqlathanor')

    csv.register_dialect('sqlathanor',
                         delimiter=delimiter,
                         doublequote=double_wrapper_character_when_nested,
                         escapechar=escape_character,
                         quotechar=wrapper_character,
                         quoting=quoting,
                         lineterminator=line_terminator)

    if not use_file:
        csv_reader = csv.DictReader(input_data,
                                    dialect='sqlathanor',
                                    restkey=None,
                                    restval=None)
        rows = [x for x in csv_reader]
    else:
        if not is_py2:
            with open(input_data, 'r', newline='') as input_file:
                csv_reader = csv.DictReader(input_file,
                                            dialect='sqlathanor',
                                            restkey=None,
                                            restval=None)
                rows = [x for x in csv_reader]
        else:
            with open(input_data, 'r') as input_file:
                csv_reader = csv.DictReader(input_file,
                                            dialect='sqlathanor',
                                            restkey=None,
                                            restval=None)
                rows = [x for x in csv_reader]

    if len(rows) < 1:
        raise CSVStructureError('expected 1 row of data and 1 header row, missing 1')
    else:
        data = rows[0]

    for key in data:
        try:
            validators.variable_name(key)
        except ValueError:
            raise CSVStructureError('column (%s) is not a valid Python variable name' % key)

        if data[key] == null_text:
            data[key] = None

    csv.unregister_dialect('sqlathanor')

    return data
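# Hedged usage sketch (not part of the source): parse_csv() turns a header row plus one
# data row into a dict keyed by column name. The pipe-delimited sample below is made up.
def _example_parse_csv():
    record = parse_csv("id|name|nickname\r\n1|Alice|None\r\n")
    # Roughly {'id': '1', 'name': 'Alice', 'nickname': None} -- the 'None' text maps to None.
    print(record)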
def generate_model_from_csv(serialized,
                            tablename,
                            primary_key,
                            cls = BaseModel,
                            serialization_config = None,
                            skip_nested = True,
                            default_to_str = False,
                            type_mapping = None,
                            base_model_attrs = None,
                            delimiter = '|',
                            wrap_all_strings = False,
                            null_text = 'None',
                            wrapper_character = "'",
                            double_wrapper_character_when_nested = False,
                            escape_character = "\\",
                            line_terminator = '\r\n',
                            **kwargs):
    """Generate a :term:`model class` from a serialized :term:`CSV <Comma-Separated Value (CSV)>` string.

    .. versionadded:: 0.3.0

    .. note::

      This function *cannot* programmatically create :term:`relationships <relationship>`, :term:`hybrid properties <hybrid property>`, or :term:`association proxies <association proxy>`.

    :param serialized: The CSV data whose column headers will be treated as column names, while value data types will determine :term:`model attribute` data types.

      .. note::

        If a Path-like object, will read the file contents from a file that is assumed to include a header row. If a :class:`str <python:str>` and has more than one record (line), will assume the first line is a header row. If a :class:`list <python:list>`, will assume the first item is the header row.

    :type serialized: :class:`str <python:str>` / Path-like object / :class:`list <python:list>`

    :param tablename: The name of the SQL table to which the model corresponds.
    :type tablename: :class:`str <python:str>`

    :param primary_key: The name of the column/key that should be used as the table's primary key.
    :type primary_key: :class:`str <python:str>`

    :param cls: The base class to use when generating a new :term:`model class`. Defaults to :class:`BaseModel` to provide serialization/de-serialization support. If a :class:`tuple <python:tuple>` of classes, will include :class:`BaseModel` in that list of classes to mixin serialization/de-serialization support. If not :obj:`None <python:None>` and not a :class:`tuple <python:tuple>`, will mixin :class:`BaseModel` with the value passed to provide serialization/de-serialization support.
    :type cls: :obj:`None <python:None>` / :class:`tuple <python:tuple>` of classes / class object

    :param serialization_config: Collection of :class:`AttributeConfiguration <sqlathanor.attributes.AttributeConfiguration>` that determine the generated model's :term:`serialization`/:term:`de-serialization` :ref:`configuration <configuration>`. If :obj:`None <python:None>`, will support serialization and de-serialization across all keys in ``serialized``. Defaults to :obj:`None <python:None>`.
    :type serialization_config: Iterable of :class:`AttributeConfiguration <sqlathanor.attributes.AttributeConfiguration>` or coercable :class:`dict <python:dict>` objects / :obj:`None <python:None>`

    :param skip_nested: If ``True`` then any keys in ``serialized`` that feature nested items (e.g. iterables, JSON objects, etc.) will be ignored. If ``False``, will treat nested items as :class:`str <python:str>`. Defaults to ``True``.
    :type skip_nested: :class:`bool <python:bool>`

    :param default_to_str: If ``True``, will automatically set a key/column whose value type cannot be determined to ``str`` (:class:`Text <sqlalchemy:sqlalchemy.types.Text>`). If ``False``, will use the value type's ``__name__`` attribute and attempt to find a mapping. Defaults to ``False``.
    :type default_to_str: :class:`bool <python:bool>`

    :param type_mapping: Determines how value types in ``serialized`` map to SQL column data types. To add a new mapping or override a default, set a key to the name of the value type in Python, and set the value to a :doc:`SQLAlchemy Data Type <sqlalchemy:core/types>`. The following are the default mappings applied:

      .. list-table::
         :widths: 30 30
         :header-rows: 1

         * - Python Literal
           - SQL Column Type
         * - ``bool``
           - :class:`Boolean <sqlalchemy:sqlalchemy.types.Boolean>`
         * - ``str``
           - :class:`Text <sqlalchemy:sqlalchemy.types.Text>`
         * - ``int``
           - :class:`Integer <sqlalchemy:sqlalchemy.types.Integer>`
         * - ``float``
           - :class:`Float <sqlalchemy:sqlalchemy.types.Float>`
         * - ``date``
           - :class:`Date <sqlalchemy:sqlalchemy.types.Date>`
         * - ``datetime``
           - :class:`DateTime <sqlalchemy:sqlalchemy.types.DateTime>`
         * - ``time``
           - :class:`Time <sqlalchemy:sqlalchemy.types.Time>`

    :type type_mapping: :class:`dict <python:dict>` with type names as keys and column data types as values.

    :param base_model_attrs: Optional :class:`dict <python:dict>` of special attributes that will be applied to the generated :class:`BaseModel <sqlathanor.declarative.BaseModel>` (e.g. ``__table_args__``). Keys will correspond to the attribute name, while the value is the value that will be applied. Defaults to :obj:`None <python:None>`.
    :type base_model_attrs: :class:`dict <python:dict>` / :obj:`None <python:None>`

    :param delimiter: The delimiter used between columns. Defaults to ``|``.
    :type delimiter: :class:`str <python:str>`

    :param wrapper_character: The string used to wrap string values when wrapping is applied. Defaults to ``'``.
    :type wrapper_character: :class:`str <python:str>`

    :param null_text: The string used to indicate an empty value if empty values are wrapped. Defaults to ``'None'``.
    :type null_text: :class:`str <python:str>`

    :param kwargs: Any additional keyword arguments will be passed to :func:`declarative_base() <sqlathanor.declarative.declarative_base>` when generating the programmatic :class:`BaseModel <sqlathanor.declarative.BaseModel>`.

    :returns: :term:`Model class` whose structure matches ``serialized``.
    :rtype: :class:`BaseModel`

    :raises UnsupportedValueTypeError: when a value in ``serialized`` does not have a corresponding key in ``type_mapping``
    :raises ValueError: if ``tablename`` is empty
    :raises DeserializationError: if ``serialized`` is not a valid :class:`str <python:str>`
    :raises CSVStructureError: if there are fewer than 2 (two) rows in ``serialized`` or if column headers are not valid Python variable names

    """
    # pylint: disable=line-too-long,too-many-arguments
    if not checkers.is_file(serialized):
        serialized = read_csv_data(serialized, single_record = False)

    from_csv = parse_csv(serialized,
                         delimiter = delimiter,
                         wrap_all_strings = wrap_all_strings,
                         null_text = null_text,
                         wrapper_character = wrapper_character,
                         double_wrapper_character_when_nested = double_wrapper_character_when_nested,
                         escape_character = escape_character,
                         line_terminator = line_terminator)

    generated_model = generate_model_from_dict(from_csv,
                                               tablename,
                                               primary_key,
                                               cls = cls,
                                               serialization_config = serialization_config,
                                               skip_nested = skip_nested,
                                               default_to_str = default_to_str,
                                               type_mapping = type_mapping,
                                               base_model_attrs = base_model_attrs,
                                               **kwargs)

    return generated_model
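# Hedged usage sketch (not part of the source): generates a declarative model class from a
# two-line CSV string via generate_model_from_csv() above. Column names and values are made
# up for illustration.
def _example_generate_model_from_csv():
    csv_data = "id|name|balance\r\n1|Alice|123.45\r\n"

    User = generate_model_from_csv(csv_data,
                                   tablename = 'users',
                                   primary_key = 'id')

    # The generated class carries the serialization helpers asserted in the tests below.
    print(User.__tablename__)        # 'users'
    print(hasattr(User, 'to_json'))  # True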
def test_from_yaml(input_files,
                   input_data,
                   tablename,
                   primary_key,
                   column_kwargs,
                   skip_nested,
                   default_to_str,
                   type_mapping,
                   expected_types,
                   error):
    # pylint: disable=no-member,line-too-long
    input_data = check_input_file(input_files, input_data)
    if not checkers.is_file(input_data):
        input_data = yaml.dump(input_data)  # pylint: disable=no-member,line-too-long

    if column_kwargs is None:
        column_kwargs = {}

    if error:
        with pytest.raises(error):
            result = Table.from_yaml(input_data,
                                     tablename=tablename,
                                     metadata=MetaData(),
                                     primary_key=primary_key,
                                     column_kwargs=column_kwargs,
                                     skip_nested=skip_nested,
                                     default_to_str=default_to_str,
                                     type_mapping=type_mapping)
    else:
        result = Table.from_yaml(input_data,
                                 tablename=tablename,
                                 metadata=MetaData(),
                                 primary_key=primary_key,
                                 column_kwargs=column_kwargs,
                                 skip_nested=skip_nested,
                                 default_to_str=default_to_str,
                                 type_mapping=type_mapping)

        assert isinstance(result, Table)
        assert result.name == tablename

        for key in column_kwargs:
            item_column = None
            for column in result.c:
                if column.name == key:
                    item_column = column
                    break

            assert item_column is not None

            for subkey in column_kwargs[key]:
                assert hasattr(item_column, subkey) is True
                item_value = getattr(item_column, subkey)
                if subkey == 'default':
                    item_value = item_value.arg

                expected_value = column_kwargs[key][subkey]

                assert item_value == expected_value

        for item in expected_types:
            item_column = None
            for column in result.c:
                if item[0] == column.name:
                    item_column = column
                    break

            assert item_column is not None
            assert isinstance(item_column.type, item[1]) is True
            assert item_column.primary_key is (item[0] == primary_key)
def _read_spss(data: Union[bytes, BytesIO, 'os.PathLike[Any]'],
               limit: Optional[int] = None,
               offset: int = 0,
               exclude_variables: Optional[List[str]] = None,
               include_variables: Optional[List[str]] = None,
               metadata_only: bool = False,
               apply_labels: bool = False,
               labels_as_categories: bool = True,
               missing_as_NaN: bool = False,
               convert_datetimes: bool = True,
               dates_as_datetime64: bool = False,
               **kwargs):
    """Internal function that reads an SPSS (.sav or .zsav) file and returns a :class:`tuple <python:tuple>` with a Pandas :class:`DataFrame <pandas:pandas.DataFrame>` object and a metadata :class:`dict <python:dict>`.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or :class:`BytesIO <python:io.BytesIO>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>` will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete metadata :class:`dict <python:dict>`. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the configuration of missing value representation stored in the underlying SPSS data. Defaults to ``False``, which applies the missing value representation configured in the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: If ``True``, will convert the native integer representation of datetime values in the SPSS data to Pythonic :class:`datetime <python:datetime.datetime>`, or :class:`date <python:datetime.date>`, etc. representations (or Pandas :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64`` parameter). If ``False``, will leave the original integer representation. Defaults to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: A :class:`DataFrame <pandas:DataFrame>` representation of the SPSS data (or :obj:`None <python:None>`) and a :class:`Metadata` representation of the dataset's metadata / data map.
    :rtype: :class:`pandas.DataFrame <pandas:DataFrame>` / :obj:`None <python:None>` and :class:`Metadata`
    """
    if not any([checkers.is_file(data),
                checkers.is_bytesIO(data),
                checkers.is_type(data, bytes)]):
        raise errors.InvalidDataFormatError(
            'data must be a filename, BytesIO, or bytes '
            f'object. Was: {data.__class__.__name__}')

    limit = validators.integer(limit, allow_empty=True, minimum=0)
    offset = validators.integer(offset, minimum=0)

    exclude_variables = validators.iterable(exclude_variables, allow_empty=True)
    if exclude_variables:
        exclude_variables = [validators.string(x) for x in exclude_variables]

    include_variables = validators.iterable(include_variables, allow_empty=True)
    if include_variables:
        include_variables = [validators.string(x) for x in include_variables]

    if not checkers.is_file(data):
        if checkers.is_bytesIO(data):
            # A BytesIO object must be reduced to raw bytes before it can be written to the
            # temporary file below.
            data = data.getvalue()

        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(data)
            temp_file_name = temp_file.name

        df, meta = pyreadstat.read_sav(temp_file_name,
                                       metadataonly=metadata_only,
                                       dates_as_pandas_datetime=dates_as_datetime64,
                                       apply_value_formats=apply_labels,
                                       formats_as_category=labels_as_categories,
                                       usecols=include_variables,
                                       user_missing=not missing_as_NaN,
                                       disable_datetime_conversion=not convert_datetimes,
                                       row_limit=limit or 0,
                                       row_offset=offset,
                                       **kwargs)

        os.remove(temp_file_name)
    else:
        df, meta = pyreadstat.read_sav(data,
                                       metadataonly=metadata_only,
                                       dates_as_pandas_datetime=dates_as_datetime64,
                                       apply_value_formats=apply_labels,
                                       formats_as_category=labels_as_categories,
                                       usecols=include_variables,
                                       user_missing=not missing_as_NaN,
                                       disable_datetime_conversion=not convert_datetimes,
                                       row_limit=limit or 0,
                                       row_offset=offset,
                                       **kwargs)

    metadata = Metadata.from_pyreadstat(meta)

    if exclude_variables:
        df = df.drop(exclude_variables, axis=1)
        if metadata.column_metadata:
            for variable in exclude_variables:
                metadata.column_metadata.pop(variable, None)

    return df, metadata
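# Hedged usage sketch (not part of the source): reads an SPSS file through _read_spss().
# The file name 'survey.sav' is an assumption; the keyword arguments shown are those
# handled by the function above.
def _example_read_spss():
    # Metadata only: no data rows are loaded, but the full data map is returned.
    _, metadata = _read_spss('survey.sav', metadata_only=True)

    # First 100 records, skipping the first 10, with value labels applied as categories.
    df, metadata = _read_spss('survey.sav',
                              limit=100,
                              offset=10,
                              apply_labels=True,
                              labels_as_categories=True)
    print(df.shape)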
def test_generate_model_from_yaml(input_files,
                                  input_data,
                                  tablename,
                                  primary_key,
                                  serialization_config,
                                  skip_nested,
                                  default_to_str,
                                  type_mapping,
                                  base_model_attrs,
                                  expected_types,
                                  error):
    # pylint: disable=no-member,line-too-long
    input_data = check_input_file(input_files, input_data)
    if not checkers.is_file(input_data):
        input_data = yaml.dump(input_data)

    if error:
        with pytest.raises(error):
            result = generate_model_from_yaml(input_data,
                                              tablename = tablename,
                                              primary_key = primary_key,
                                              serialization_config = serialization_config,
                                              skip_nested = skip_nested,
                                              default_to_str = default_to_str,
                                              type_mapping = type_mapping,
                                              base_model_attrs = base_model_attrs)
    else:
        result = generate_model_from_yaml(input_data,
                                          tablename = tablename,
                                          primary_key = primary_key,
                                          serialization_config = serialization_config,
                                          skip_nested = skip_nested,
                                          default_to_str = default_to_str,
                                          type_mapping = type_mapping,
                                          base_model_attrs = base_model_attrs)

        assert hasattr(result, 'to_json') is True
        assert hasattr(result, 'new_from_json') is True
        assert hasattr(result, 'update_from_json') is True
        assert hasattr(result, '__serialization__') is True

        assert result.__tablename__ == tablename

        for item in expected_types:
            assert hasattr(result, item[0]) is True
            attribute = getattr(result, item[0], None)
            assert isinstance(attribute.type, item[1]) is True

        if serialization_config:
            for item in serialization_config:
                assert hasattr(result, item.name) is True
                assert result.get_attribute_serialization_config(item.name) == item
        else:
            for item in expected_types:
                assert hasattr(result, item[0]) is True
                assert result.get_attribute_serialization_config(item[0]).supports_csv == (True, True)
                assert result.get_attribute_serialization_config(item[0]).supports_json == (True, True)
                assert result.get_attribute_serialization_config(item[0]).supports_yaml == (True, True)
                assert result.get_attribute_serialization_config(item[0]).supports_dict == (True, True)

        if base_model_attrs:
            for key in base_model_attrs:
                assert hasattr(result, key) is True
                assert getattr(result, key) == base_model_attrs[key]