def test_parse_with_custom_parser(self):
    """
    Registers a custom function-based parser for XML files and uses it to parse a folder.

    A parser producing ``ElementTree`` objects from '.xml' files is registered on a new
    ``RootParser``; the './xml_collection' folder is then parsed as a collection of
    ``ElementTree`` and the serialized root of each parsed tree is pretty-printed.

    :return:
    """
    from typing import Type
    from parsyfiles.converting_core import T
    from logging import Logger
    from xml.etree.ElementTree import ElementTree, parse, tostring

    def read_xml(desired_type: Type[T], file_path: str, encoding: str, logger: Logger, **kwargs):
        """
        Parses the XML file located at file_path and returns the resulting ElementTree.
        Only file_path is used; the other arguments are accepted but ignored.

        :param desired_type:
        :param file_path:
        :param encoding:
        :param logger:
        :param kwargs:
        :return:
        """
        return parse(file_path)

    # wrap the function as a (non-streaming) parser of '.xml' files into ElementTree objects
    xml_parser = SingleFileParserFunction(parser_function=read_xml,
                                          streaming_mode=False,
                                          supported_exts={'.xml'},
                                          supported_types={ElementTree})

    # register it on a dedicated root parser
    root = RootParser('parsyfiles with timeseries')
    root.register_parser(xml_parser)

    # parse the whole folder, then display the serialized root element of each tree
    trees = root.parse_collection(fix_path('./xml_collection'), ElementTree)
    serialized = {name: tostring(tree.getroot()) for name, tree in trees.items()}
    pprint(serialized)
def test_union_recursive_1(root_parser: RootParser):
    """
    Checks that infinitely-nested dictionaries can be parsed from a folder when the
    desired type is declared with forward references.
    """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    # First (preferred) way: a dictionary whose values are either A or the dictionary type itself
    InfiniteRecursiveDictOfA = Dict[str, Union[A, 'InfiniteRecursiveDictOfA']]

    tree = root_parser.parse_item(get_path('test2'), InfiniteRecursiveDictOfA)
    assert type(tree['a']['a']['a']) is A
    assert type(tree['a']['a']['b']) is A
    assert type(tree['a']['b']) is A
    assert type(tree['b']) is A

    # Less preferred way, but check that it works too: either A or a dictionary of the union itself
    InfiniteRecursiveDictOfA2 = Union[A, Dict[str, 'InfiniteRecursiveDictOfA2']]

    tree = root_parser.parse_collection(get_path('test2'), InfiniteRecursiveDictOfA2)
    assert type(tree['a']['a']['a']) is A
    assert type(tree['a']['a']['b']) is A
    assert type(tree['a']['b']) is A
    assert type(tree['b']) is A

    # This is a forward reference that is equivalent to 'A'.
    # It should be handled correctly by parsyfiles so as not to lead to infinite recursion
    InfiniteRecursiveDictOfA3 = Union[A, 'InfiniteRecursiveDictOfA3']

    leaf = root_parser.parse_item(get_path('test2', 'b'), InfiniteRecursiveDictOfA3)
    assert type(leaf) is A
def test_union_2(root_parser: RootParser):
    """
    Checks that a folder can be parsed as a collection whose items are a Union of two classes.
    """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B:
        def __init__(self, bar: float):
            self.bar = bar

    desired_type = Union[A, B]
    parsed = root_parser.parse_collection(get_path('test1'), desired_type)

    # exactly two items are expected, one of each alternative of the union
    assert len(parsed) == 2
    assert type(parsed['a']) is A
    assert type(parsed['b']) is B
def test_parse_subtypes(root_parser: RootParser):
    """
    Checks that asking for a base class yields instances of its subclasses.
    """

    class A:
        pass

    class B(A):
        def __init__(self, foo: str):
            self.foo = foo

    class C(B):
        def __init__(self, bar: str):
            super().__init__(foo=bar)

    # the desired type is the root of the hierarchy; each item should come back as a subclass
    parsed = root_parser.parse_collection(get_path('test2'), A)
    assert type(parsed['b']) is B
    assert type(parsed['c']) is C
def test_pass_parser_options(self):
    """
    Passes options to the pandas parsers.

    Looks up the parsers able to produce a ``DataFrame`` from '.csv' and '.xls' files,
    prints their option identifier, implementing function and option hints, then parses
    the './test_data/demo/ts_collection' folder with options targeted at the
    'read_df_or_series_from_csv' and 'read_dataframe_from_xls' parsers.

    :return:
    """
    from pandas import DataFrame
    from parsyfiles import RootParser

    # create a root parser
    parser = RootParser()

    # retrieve the parsers of interest
    parsers = parser.get_capabilities_for_type(DataFrame, strict_type_matching=False)

    df_csv_parser = parsers['.csv']['1_exact_match'][0]
    p_id_csv = df_csv_parser.get_id_for_options()
    print('Parser id for csv is : ' + p_id_csv
          + ', implementing function is ' + repr(df_csv_parser._parser_func))
    print('option hints : ' + df_csv_parser.options_hints())

    df_xls_parser = parsers['.xls']['1_exact_match'][0]
    p_id_xls = df_xls_parser.get_id_for_options()
    # this message reports the xls parser (it used to be labelled 'csv' by mistake)
    print('Parser id for xls is : ' + p_id_xls
          + ', implementing function is ' + repr(df_xls_parser._parser_func))
    print('option hints : ' + df_xls_parser.options_hints())

    from parsyfiles import create_parser_options, add_parser_options

    # configure the DataFrame parsers to automatically parse dates and use the first column as index
    opts = create_parser_options()
    opts = add_parser_options(opts, 'read_df_or_series_from_csv', {'parse_dates': True, 'index_col': 0})
    opts = add_parser_options(opts, 'read_dataframe_from_xls', {'index_col': 0})

    dfs = parser.parse_collection(fix_path('./test_data/demo/ts_collection'), DataFrame, options=opts)
    print(dfs)
def test_typevars_1(root_parser: RootParser):
    """
    Tests that a constructor containing TypeVars is correctly handled.

    The desired type is a generic class whose constructor argument is annotated with a
    TypeVar bound to A; each parsed item is expected to wrap an instance of A or of its
    subclass B.
    """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B(A):
        def __init__(self, bar: float):
            # A declares 'foo' as a str, so convert the float before forwarding it
            super().__init__(foo=str(bar))

    TV = TypeVar('TV', bound=A)

    class Test(Generic[TV]):
        def __init__(self, obj: TV):
            self.obj = obj

    items = root_parser.parse_collection(get_path('test1'), Test)

    assert len(items) == 2
    assert type(items['a'].obj) is A
    assert type(items['b'].obj) is B
def test_typevars_3(root_parser: RootParser):
    """
    Checks that a TypeVar declared with 'constraints' can be used directly as the desired
    type -> it will be a Union
    """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B:
        def __init__(self, bar: float):
            self.bar = bar

    TV = TypeVar('TV', A, B)

    # single items: each one should come back as one of the constraint types
    first = root_parser.parse_item(get_path('test2', 'a'), TV)
    assert type(first) is A

    second = root_parser.parse_item(get_path('test2', 'b'), TV)
    assert type(second) is B

    # whole folder as a collection
    parsed = root_parser.parse_collection(get_path('test2'), TV)
    assert len(parsed) == 2
    assert type(parsed['a']) is A
    assert type(parsed['b']) is B
def test_typevars_2(root_parser: RootParser):
    """
    Checks that a TypeVar declared with 'bound' can be used directly as the desired
    type -> it will be replaced with the bound type
    """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B(A):
        def __init__(self, bar: float):
            super().__init__(foo=str(bar))

    TV = TypeVar('TV', bound=A)

    # single items: the bound type itself, then its subclass
    first = root_parser.parse_item(get_path('test2', 'a'), TV)
    assert type(first) is A

    second = root_parser.parse_item(get_path('test2', 'b'), TV)
    assert type(second) is B

    # whole folder as a collection
    parsed = root_parser.parse_collection(get_path('test2'), TV)
    assert len(parsed) == 2
    assert type(parsed['a']) is A
    assert type(parsed['b']) is B
def test_parse_subclass_of_known_with_custom_converter(self):
    """
    Parses a collection of a DataFrame subclass thanks to a custom converter.

    A ``TimeSeries`` class extending ``DataFrame`` is declared, together with a converter
    from ``DataFrame`` to ``TimeSeries`` that is registered on a new ``RootParser``. The
    './ts_collection' folder is then parsed as a collection of ``TimeSeries``, with options
    targeted at the 'read_df_or_series_from_csv' and 'read_dataframe_from_xls' parsers.

    :return:
    """

    # -- define your class
    from pandas import DataFrame, DatetimeIndex

    class TimeSeries(DataFrame):
        """ A basic timeseries class extending DataFrame and delegating to a wrapped DataFrame """

        def __init__(self, df: DataFrame):
            """
            Constructor from a DataFrame, whose index should be an instance of DatetimeIndex.
            An index without timezone is localized to UTC.

            :param df:
            :raises ValueError: if df is not a DataFrame or its index is not a DatetimeIndex
            """
            has_datetime_index = isinstance(df, DataFrame) and isinstance(df.index, DatetimeIndex)
            if not has_datetime_index:
                raise ValueError('Error creating TimeSeries from DataFrame: provided DataFrame does not have a '
                                 'valid DatetimeIndex')

            if df.index.tz is None:
                # use the UTC hypothesis in absence of other hints
                df.index = df.index.tz_localize(tz='UTC')

            self._df = df

        def __getattr__(self, item):
            # Only called when the attribute was not found the usual way: anything that is
            # not implemented here is redirected to the wrapped dataframe.
            # This is the easy version of a dynamic proxy;
            # see http://code.activestate.com/recipes/496741-object-proxying/ for "the answer"
            wrapped = object.__getattribute__(self, '_df')
            if not hasattr(wrapped, item):
                raise AttributeError('\'' + self.__class__.__name__ + '\' object has no attribute \'' + item + '\'')
            return getattr(wrapped, item)

        def update(self, other, join='left', overwrite=True, filter_func=None, raise_conflict=False):
            """ Delegates to the wrapped dataframe (this method was abstract in DataFrame for some reason) """
            return self._df.update(other, join=join, overwrite=overwrite,
                                   filter_func=filter_func, raise_conflict=raise_conflict)

    # -- create your converter
    from typing import Type
    from logging import Logger
    from parsyfiles.converting_core import ConverterFunction

    def df_to_ts(desired_type: Type[TimeSeries], df: DataFrame, logger: Logger) -> TimeSeries:
        """ Converts a DataFrame into a TimeSeries by wrapping it """
        return TimeSeries(df)

    ts_converter = ConverterFunction(from_type=DataFrame, to_type=TimeSeries, conversion_method=df_to_ts)

    # -- create a parser and register your converter
    from parsyfiles import RootParser, create_parser_options, add_parser_options

    root = RootParser('parsyfiles with timeseries')
    root.register_converter(ts_converter)

    # -- you might wish to configure the DataFrame parser, though:
    csv_options = {'parse_dates': True, 'index_col': 0}
    xls_options = {'index_col': 0}
    opts = create_parser_options()
    opts = add_parser_options(opts, 'read_df_or_series_from_csv', csv_options)
    opts = add_parser_options(opts, 'read_dataframe_from_xls', xls_options)

    dfs = root.parse_collection(fix_path('./ts_collection'), TimeSeries, options=opts)