def test_parse_with_custom_parser(self):
    """
    Parses a collection of XML files with a custom parser function that returns ElementTree objects.
    :return:
    """
    from typing import Type
    from parsyfiles.converting_core import T
    from logging import Logger
    from xml.etree.ElementTree import ElementTree, parse, tostring

    def read_xml(desired_type: Type[T], file_path: str, encoding: str,
                 logger: Logger, **kwargs):
        """
        Opens an XML file and returns the tree parsed from it as an ElementTree.

        :param desired_type:
        :param file_path:
        :param encoding:
        :param logger:
        :param kwargs:
        :return:
        """
        return parse(file_path)

    my_parser = SingleFileParserFunction(parser_function=read_xml,
                                         streaming_mode=False,
                                         supported_exts={'.xml'},
                                         supported_types={ElementTree})

    parser = RootParser('parsyfiles with xml')
    parser.register_parser(my_parser)

    xmls = parser.parse_collection(fix_path('./xml_collection'), ElementTree)
    pprint({name: tostring(x.getroot()) for name, x in xmls.items()})
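# A hedged sketch of the folder layout the test above assumes (file names are
# hypothetical, not taken from the real test data): parse_collection scans the
# folder and returns a dict mapping each file name (without extension) to the
# parsed object, here one ElementTree per '.xml' file.
#
#   xml_collection/
#       a.xml   ->  xmls['a']
#       b.xml   ->  xmls['b']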
def test_union_recursive_1(root_parser: RootParser):
    """ Tests that you can parse infinitely-nested dictionaries from a folder using forward references """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    # First (preferred) way
    InfiniteRecursiveDictOfA = Dict[str, Union[A, 'InfiniteRecursiveDictOfA']]

    items = root_parser.parse_item(get_path('test2'), InfiniteRecursiveDictOfA)
    assert type(items['a']['a']['a']) == A
    assert type(items['a']['a']['b']) == A
    assert type(items['a']['b']) == A
    assert type(items['b']) == A

    # Less preferred way, but check that it works too
    InfiniteRecursiveDictOfA2 = Union[A, Dict[str, 'InfiniteRecursiveDictOfA2']]

    items = root_parser.parse_collection(get_path('test2'), InfiniteRecursiveDictOfA2)
    assert type(items['a']['a']['a']) == A
    assert type(items['a']['a']['b']) == A
    assert type(items['a']['b']) == A
    assert type(items['b']) == A

    # This is a forward reference that is equivalent to 'A'.
    # It should be handled correctly by parsyfiles so as not to lead to infinite recursion.
    InfiniteRecursiveDictOfA3 = Union[A, 'InfiniteRecursiveDictOfA3']

    item = root_parser.parse_item(get_path('test2', 'b'), InfiniteRecursiveDictOfA3)
    assert type(item) == A
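# The assertions above imply the following nested layout for get_path('test2');
# the folder and item names come from the dictionary keys being asserted, while
# the leaf-file extensions are hypothetical. Each leaf must be parseable as an A:
#
#   test2/
#       b.txt         -> items['b']
#       a/
#           b.txt     -> items['a']['b']
#           a/
#               a.txt -> items['a']['a']['a']
#               b.txt -> items['a']['a']['b']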
def test_print_and_get_capabilities_by_type(root_parser: RootParser):
    """ Tests that the declared capabilities by type are correct """
    c = root_parser.get_capabilities_by_type(strict_type_matching=False)
    print('\n' + str(len(c)) + ' Root parser capabilities by type:')
    assert len(c) == 15
    cdict = to_str_coll(c)

    # dump(cdict, 'reference_capabilities_by_type.json')
    assert cdict == load('reference_capabilities_by_type.json')
    root_parser.print_capabilities_by_type(strict_type_matching=False)
def test_root_parser_any():
    """
    Tests that we can ask the root parser for its capabilities to parse a given type
    :return:
    """
    root_parser = RootParser()

    # print
    root_parser.print_capabilities_for_type(typ=Any)

    # details
    res = root_parser.find_all_matching_parsers(strict=False, desired_type=AnyObject,
                                                required_ext='.cfg')
    match_generic, match_approx, match_exact = res[0]
    assert len(match_generic) == 0
    assert len(match_approx) == 0
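# Note on the unpacking above: find_all_matching_parsers returns a tuple whose
# first element is itself a 3-tuple of parser lists, ordered from loosest to
# tightest match (generic, approximate, exact), as the variable names show.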
def test_get_all_supported_types_pretty_str(root_parser: RootParser):
    """ Tests that the declared supported types are there and that their number is correct """
    t = root_parser.get_all_supported_types_pretty_str()
    print('\n' + str(len(t)) + ' Root parser supported types:')
    pprint(t)
    assert len(t) == 15

    # dump(list(t), 'reference_supported_types.json')
    assert t == set(load('reference_supported_types.json'))
def test_get_all_supported_exts(root_parser: RootParser):
    """ Tests that the declared supported extensions are there and that their number is correct """
    e = root_parser.get_all_supported_exts()
    print('\n' + str(len(e)) + ' Root parser supported extensions:')
    pprint(e)
    assert len(e) == 13

    # dump(list(e), 'reference_supported_exts.json')
    assert e == set(load('reference_supported_exts.json'))
def test_get_all_parsers(root_parser: RootParser):
    """ Tests that the default parsers are there and that their number is correct """
    parsers = root_parser.get_all_parsers(strict_type_matching=False)
    print('\n' + str(len(parsers)) + ' Root parser parsers:')
    pprint(parsers)
    assert len(parsers) == 127
    parsers_str = to_str_coll(parsers)

    # dump(parsers_str, 'reference_parsers.json')
    assert parsers_str == load('reference_parsers.json')
def test_pass_parser_options(self):
    """
    Passes options to the pandas parser
    :return:
    """
    from pandas import DataFrame
    from parsyfiles import RootParser

    # create a root parser
    parser = RootParser()

    # retrieve the parsers of interest
    parsers = parser.get_capabilities_for_type(DataFrame, strict_type_matching=False)
    df_csv_parser = parsers['.csv']['1_exact_match'][0]
    p_id_csv = df_csv_parser.get_id_for_options()
    print('Parser id for csv is : ' + p_id_csv + ', implementing function is '
          + repr(df_csv_parser._parser_func))
    print('option hints : ' + df_csv_parser.options_hints())

    df_xls_parser = parsers['.xls']['1_exact_match'][0]
    p_id_xls = df_xls_parser.get_id_for_options()
    print('Parser id for xls is : ' + p_id_xls + ', implementing function is '
          + repr(df_xls_parser._parser_func))
    print('option hints : ' + df_xls_parser.options_hints())

    from parsyfiles import create_parser_options, add_parser_options

    # configure the DataFrame parsers to automatically parse dates and use the first column as index
    opts = create_parser_options()
    opts = add_parser_options(opts, 'read_df_or_series_from_csv', {'parse_dates': True, 'index_col': 0})
    opts = add_parser_options(opts, 'read_dataframe_from_xls', {'index_col': 0})

    dfs = parser.parse_collection(fix_path('./test_data/demo/ts_collection'), DataFrame, options=opts)
    print(dfs)
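# A hedged note on the options pattern above: the literal ids passed to
# add_parser_options ('read_df_or_series_from_csv', 'read_dataframe_from_xls')
# are the strings returned by get_id_for_options() on the corresponding
# parsers, so they could equally be supplied programmatically, e.g.:
#
#   opts = add_parser_options(opts, p_id_csv, {'parse_dates': True, 'index_col': 0})
#
# The option values themselves ('parse_dates', 'index_col') appear to be
# forwarded to the underlying pandas readers as read_csv / read_excel
# keyword arguments.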
def test_union_1(root_parser: RootParser):
    """ Tests that parsing a Union works """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B:
        def __init__(self, bar: float):
            self.bar = bar

    item = root_parser.parse_item(get_path('test1', 'a'), Union[A, B])
    assert type(item) == A
def test_simple_collection(self):
    """
    Parsing a collection of dataframes as a dictionary
    :return:
    """
    from pandas import DataFrame

    dfs = parse_collection(fix_path('./simple_collection'), DataFrame)
    pprint(dfs)

    df = parse_item(fix_path('./simple_collection/c'), DataFrame)
    pprint(df)

    RootParser().print_capabilities_for_type(typ=DataFrame)
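# Implied layout (only the item 'c' is named in the code; the other names and
# the extensions are hypothetical): a folder of files, each parseable as a
# DataFrame, keyed by file name in the returned dict.
#
#   simple_collection/
#       a.csv -> dfs['a'], b.csv -> dfs['b'], c.csv -> dfs['c'], ...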
def test_typevars_3(root_parser: RootParser):
    """ Tests that a TypeVar with 'constraints' may be used as a desired Type -> it will be a Union """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B:
        def __init__(self, bar: float):
            self.bar = bar

    TV = TypeVar('TV', A, B)

    item = root_parser.parse_item(get_path('test2', 'a'), TV)
    assert type(item) == A

    item = root_parser.parse_item(get_path('test2', 'b'), TV)
    assert type(item) == B

    items = root_parser.parse_collection(get_path('test2'), TV)
    assert len(items) == 2
    assert type(items['a']) == A
    assert type(items['b']) == B
def test_union_2(root_parser: RootParser):
    """ Tests that parsing a collection of Union works """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B:
        def __init__(self, bar: float):
            self.bar = bar

    items = root_parser.parse_collection(get_path('test1'), Union[A, B])
    assert len(items) == 2
    assert type(items['a']) == A
    assert type(items['b']) == B
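# Implied layout for get_path('test1'), shared by the Union tests above: one
# item parseable as an A (constructor takes foo: str) and one parseable as a B
# (constructor takes bar: float). Extensions are hypothetical:
#
#   test1/
#       a.txt -> items['a']  (an A)
#       b.txt -> items['b']  (a B)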
def test_typevars_2(root_parser: RootParser):
    """ Tests that a TypeVar with 'bound' may be used as a desired Type directly
    -> it will be replaced with the bound type """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B(A):
        def __init__(self, bar: float):
            super(B, self).__init__(foo=str(bar))

    TV = TypeVar('TV', bound=A)

    item = root_parser.parse_item(get_path('test2', 'a'), TV)
    assert type(item) == A

    item = root_parser.parse_item(get_path('test2', 'b'), TV)
    assert type(item) == B

    items = root_parser.parse_collection(get_path('test2'), TV)
    assert len(items) == 2
    assert type(items['a']) == A
    assert type(items['b']) == B
def test_simple_objects(self):
    """
    Parsing a collection of simple objects_data
    :return:
    """

    # First define the function that we want to test
    # (not useful, but just to show a complete story in the readme...)
    def exec_op(x: float, y: float, op: str) -> float:
        # note: '==' is required here; 'is' would compare object identity, which
        # is unreliable for strings
        if op == '+':
            return x + y
        elif op == '-':
            return x - y
        else:
            raise ValueError('Unsupported operation : \'' + op + '\'')

    # Then define the simple class representing your test case
    class ExecOpTest(object):
        def __init__(self, x: float, y: float, op: str, expected_result: float):
            self.x = x
            self.y = y
            self.op = op
            self.expected_result = expected_result

        def __str__(self):
            return self.__repr__()

        def __repr__(self):
            return str(self.x) + ' ' + self.op + ' ' + str(self.y) + ' =? ' + str(self.expected_result)

    # Create the parser and parse a single file
    # e = parse_item('./test_data/objects_data/test_diff_1', ExecOpTest)
    # pprint(e)

    # parse all of them as dicts
    sf_tests_dct = parse_collection(fix_path('./simple_objects'), Dict)

    # assert that they are sorted
    assert list(sf_tests_dct.keys()) == list(sorted(sf_tests_dct.keys()))

    # parse all of them as objects_data
    sf_tests = parse_collection(fix_path('./simple_objects'), ExecOpTest)
    pprint(sf_tests)

    # RootParser().print_capabilities_for_type(typ=ExecOpTest)
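# A purely hypothetical illustration of one simple-object file: any supported
# format that parses into a dict whose keys match the ExecOpTest constructor
# arguments will do, for example a properties-style 'test_plus_1.txt' (the
# name and exact format are invented here for illustration):
#
#   x = 1
#   y = 2
#   op = +
#   expected_result = 3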
def test_parse_subtypes(root_parser: RootParser):
    """ Tests that subclasses can be parsed """

    class A:
        pass

    class B(A):
        def __init__(self, foo: str):
            self.foo = foo

    class C(B):
        def __init__(self, bar: str):
            super(C, self).__init__(foo=bar)

    items = root_parser.parse_collection(get_path('test2'), A)
    assert type(items['b']) == B
    assert type(items['c']) == C
def test_custom_parser_ok_for_subclasses():
    """
    Tests that if you register a custom parser for a subclass B of A, it gets correctly used to parse A
    (in non-strict mode, which is the default)
    :return:
    """
    root_parser = RootParser()

    class A:
        def __init__(self, txt):
            self.txt = txt

    class B(A):
        """ a subclass of A """
        pass

    def read_B_from_txt(desired_type: Type[B], file_object: TextIOBase,
                        logger: Logger, *args, **kwargs) -> B:
        # read the entire stream into a string
        str_io = StringIO()
        shutil.copyfileobj(file_object, str_io)
        # only return the first character
        return B(str_io.getvalue()[0])

    # before registering a parser for B, only generic parsers are able to parse an A
    before_capa = root_parser.get_capabilities_for_type(A)['.txt']
    assert list(before_capa.keys()) == ['3_generic']

    # register a parser for B
    root_parser.register_parser(SingleFileParserFunction(parser_function=read_B_from_txt,
                                                         streaming_mode=True,
                                                         supported_exts={'.txt'},
                                                         supported_types={B}))

    # after registering, the new parser appears in the list able to parse an A
    after_capa = root_parser.get_capabilities_for_type(A)['.txt']
    assert str(after_capa['2_approx_match'][0]) == '<read_B_from_txt>'

    a = root_parser.parse_item(get_path('b64pickle-float-1.0=True'), A)

    # check that the custom parser was used, not the generic 'construct from string'
    assert len(a.txt) == 1
    assert a.txt == 'g'
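# Why this works in non-strict mode: the request is for type A, the registered
# parser produces B, and B is a subclass of A, so the parser is ranked as an
# approximate match ('2_approx_match') and can be selected. The final asserts
# simply rely on the raw text of the test file starting with the character 'g'.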
def test_typevars_1(root_parser: RootParser):
    """ Tests that a constructor containing TypeVars is correctly handled """

    class A:
        def __init__(self, foo: str):
            self.foo = foo

    class B(A):
        def __init__(self, bar: float):
            # convert to str to honour A's 'foo: str' annotation
            super(B, self).__init__(foo=str(bar))

    TV = TypeVar('TV', bound=A)

    class Test(Generic[TV]):
        def __init__(self, obj: TV):
            self.obj = obj

    items = root_parser.parse_collection(get_path('test1'), Test)
    assert len(items) == 2
    assert type(items['a'].obj) == A
    assert type(items['b'].obj) == B
def test_get_all_conversion_chains(root_parser: RootParser):
    """ Tests that the default conversion chains are there and that their number is correct """
    chains = root_parser.get_all_conversion_chains()
    print('\n' + str(len(chains[0])) + ' (generic) + ' + str(len(chains[2]))
          + ' (specific) Root parser converters:')
    pprint(chains)
    assert len(chains[0]) == 22
    assert len(chains[1]) == 0
    assert len(chains[2]) == 200

    generic_chains_str = to_str_coll(chains[0])
    specific_chains_str = to_str_coll(chains[2])

    # dump(generic_chains_str, 'reference_generic_conversion_chains.json')
    assert generic_chains_str == load('reference_generic_conversion_chains.json')

    # dump(specific_chains_str, 'reference_specific_conversion_chains.json')
    assert specific_chains_str == load('reference_specific_conversion_chains.json')
def test_multifile_objects(self):
    """
    Parsing a list of multifile objects_data
    :return:
    """
    from pandas import Series, DataFrame

    class AlgoConf(object):
        def __init__(self, foo_param: str, bar_param: int):
            self.foo_param = foo_param
            self.bar_param = bar_param

    class AlgoResults(object):
        def __init__(self, score: float, perf: float):
            self.score = score
            self.perf = perf

    def exec_op_series(x: Series, y: AlgoConf) -> AlgoResults:
        pass

    class ExecOpSeriesTest(object):
        def __init__(self, x: Series, y: AlgoConf, expected_results: AlgoResults):
            self.x = x
            self.y = y
            self.expected_results = expected_results

    # parse all of them
    mf_tests = parse_collection(fix_path('./complex_objects'), ExecOpSeriesTest)
    pprint(mf_tests)

    RootParser().print_capabilities_for_type(typ=ExecOpSeriesTest)

    from parsyfiles import FlatFileMappingConfiguration
    dfs = parse_collection(fix_path('./complex_objects_flat'), DataFrame,
                           file_mapping_conf=FlatFileMappingConfiguration())
    pprint(dfs)
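# A reconstructed sketch of the multifile layout the first parse above relies
# on (case names and extensions are hypothetical): each test case is a folder
# whose items match the ExecOpSeriesTest constructor arguments. The second
# parse reads a flat variant, where FlatFileMappingConfiguration encodes the
# same structure in file names instead of sub-folders.
#
#   complex_objects/
#       case1/
#           x.csv                 -> Series
#           y.cfg                 -> AlgoConf
#           expected_results.cfg  -> AlgoResults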
class AllTests(TestCase):

    def setUp(self):
        """
        Creates the root parser to be used in most tests
        :return:
        """
        self.root_parser = RootParser()

    def test_a_root_parser_capabilities(self):
        """
        Tests that we can print the capabilities of the root parser: registered parsers and converters,
        supported extensions and types, etc.
        :return:
        """
        p = self.root_parser.get_all_parsers(strict_type_matching=False)
        print('\n' + str(len(p)) + ' Root parser parsers:')
        pprint(p)

        print('Testing option hints for parsing chain')
        print(p[0].options_hints())

        c = self.root_parser.get_all_conversion_chains()
        print('\n' + str(len(c[0]) + len(c[2])) + ' Root parser converters:')
        pprint(c)

        e = self.root_parser.get_all_supported_exts()
        print('\n' + str(len(e)) + ' Root parser supported extensions:')
        pprint(e)

        t = self.root_parser.get_all_supported_types_pretty_str()
        print('\n' + str(len(t)) + ' Root parser supported types:')
        pprint(t)

        print('\nRoot parser parsers by extensions:')
        self.root_parser.print_capabilities_by_ext(strict_type_matching=False)

        print('\nRoot parser parsers by types:')
        self.root_parser.print_capabilities_by_type(strict_type_matching=False)

    def test_b_root_parser_any(self):
        """
        Tests that we can ask the root parser for its capabilities to parse a given type
        :return:
        """
        # print
        self.root_parser.print_capabilities_for_type(typ=Any)

        # details
        res = self.root_parser.find_all_matching_parsers(strict=False, desired_type=AnyObject,
                                                         required_ext='.cfg')
        match_generic, match_approx, match_exact = res[0]
        self.assertEqual(len(match_generic), 0)
        self.assertEqual(len(match_approx), 0)

    def test_objects_support(self):
        """
        Tests all the supported ways to parse a simple object
        :return:
        """

        # Define the simple class representing your test case
        class ExecOpTest(object):
            def __init__(self, x: float, y: float, op: str, expected_result: float):
                self.x = x
                self.y = y
                self.op = op
                self.expected_result = expected_result

            def __str__(self):
                return self.__repr__()

            def __repr__(self):
                return str(self.x) + ' ' + self.op + ' ' + str(self.y) + ' =? ' + str(self.expected_result)

        # create the parser and parse a single file
        e = parse_item(fix_path('./test_data/objects/test_diff_1'), ExecOpTest)
        pprint(e)

        # parse all of them
        e = parse_collection(fix_path('./test_data/objects'), ExecOpTest)
        pprint(e)

    def test_collections(self):
        """
        Tests all the supported ways to parse collections
        :return:
        """
        items = parse_item(fix_path('./test_data/collections'),
                           Tuple[Dict[str, int], List[int], Set[int], Tuple[str, int, str]])
        print(items)
def test_parse_subclass_of_known_with_custom_converter(self):
    """
    Parses a subclass of DataFrame with a custom converter.
    :return:
    """

    # define your class
    from pandas import DataFrame, DatetimeIndex

    class TimeSeries(DataFrame):
        """
        A basic timeseries class that extends DataFrame
        """

        def __init__(self, df: DataFrame):
            """
            Constructor from a DataFrame. The DataFrame index should be an instance of DatetimeIndex
            :param df:
            """
            if isinstance(df, DataFrame) and isinstance(df.index, DatetimeIndex):
                if df.index.tz is None:
                    # use the UTC hypothesis in absence of other hints
                    df.index = df.index.tz_localize(tz='UTC')
                self._df = df
            else:
                raise ValueError('Error creating TimeSeries from DataFrame: provided DataFrame does not have a '
                                 'valid DatetimeIndex')

        def __getattr__(self, item):
            # Redirects anything that is not implemented here to the base dataframe.
            # This is called only if the attribute was not found the usual way.
            # Easy version of the dynamic proxy, just to save time :)
            # see http://code.activestate.com/recipes/496741-object-proxying/ for "the answer"
            df = object.__getattribute__(self, '_df')
            if hasattr(df, item):
                return getattr(df, item)
            else:
                raise AttributeError('\'' + self.__class__.__name__ + '\' object has no attribute \''
                                     + item + '\'')

        def update(self, other, join='left', overwrite=True, filter_func=None, raise_conflict=False):
            """ For some reason this method was abstract in DataFrame so we have to implement it """
            return self._df.update(other, join=join, overwrite=overwrite, filter_func=filter_func,
                                   raise_conflict=raise_conflict)

    # -- create your converter
    from typing import Type
    from logging import Logger
    from parsyfiles.converting_core import ConverterFunction

    def df_to_ts(desired_type: Type[TimeSeries], df: DataFrame, logger: Logger) -> TimeSeries:
        """ Converter from DataFrame to TimeSeries """
        return TimeSeries(df)

    my_converter = ConverterFunction(from_type=DataFrame, to_type=TimeSeries,
                                     conversion_method=df_to_ts)

    # -- create a parser and register your converter
    from parsyfiles import RootParser, create_parser_options, add_parser_options
    parser = RootParser('parsyfiles with timeseries')
    parser.register_converter(my_converter)

    # -- you might wish to configure the DataFrame parser, though:
    opts = create_parser_options()
    opts = add_parser_options(opts, 'read_df_or_series_from_csv', {'parse_dates': True, 'index_col': 0})
    opts = add_parser_options(opts, 'read_dataframe_from_xls', {'index_col': 0})

    dfs = parser.parse_collection(fix_path('./ts_collection'), TimeSeries, options=opts)
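# A hedged usage sketch following the parse above (the item name 'ts1' is
# hypothetical): since the converter is registered from DataFrame, every
# existing DataFrame parsing chain (.csv, .xls, ...) is automatically extended
# to produce TimeSeries objects.
#
#   ts = parser.parse_item(fix_path('./ts_collection/ts1'), TimeSeries, options=opts)
#   assert ts.index.tz is not None   # the constructor localized naive indexes to UTC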
def parse_with_new_instance():
    rp = RootParser()
    result = rp.parse_item(os.path.join(THIS_DIR, 'test_data/b64pickle-float-1.0=True'), bool)
    assert result is True
def root_parser():
    logger = getLogger('parsyfiles')
    logger.setLevel(DEBUG)
    return RootParser()
def setUp(self):
    """
    Creates the root parser to be used in most tests
    :return:
    """
    self.root_parser = RootParser()
def test_option_hints(root_parser: RootParser):
    """ Tests the option_hints method on the first parser available """
    print('Testing option hints for parsing chain')
    p = root_parser.get_all_parsers(strict_type_matching=False)
    print(p[0].options_hints())