def test_parse_ids_subsheet(self):
    """With root_id='custom', the root id and parent ids appear on every sub-sheet row."""
    parser = JSONParser(
        root_json_dict=[
            OrderedDict(
                [
                    ("custom", 1),
                    ("id", 2),
                    (
                        "testnest",
                        [
                            OrderedDict(
                                [
                                    ("id", 3),
                                    ("a", "b"),
                                    (
                                        "c",
                                        [
                                            OrderedDict([("d", "e")]),
                                            OrderedDict([("d", "e2")]),
                                        ],
                                    ),
                                    (
                                        "f",
                                        {"g": "h"},
                                    ),  # Check that having nested objects doesn't break ID output
                                ]
                            )
                        ],
                    ),
                ]
            )
        ],
        root_id="custom",
    )
    parser.parse()
    # Only the root-level scalar fields stay on the main sheet.
    assert list(parser.main_sheet) == ["custom", "id"]
    assert parser.main_sheet.lines == [{"custom": 1, "id": 2,}]
    # Each sub-sheet carries root_id + parent ids alongside its own columns.
    assert listify(parser.sub_sheets) == {
        "testnest": [
            "custom",
            "id",
            "testnest/0/id",
            "testnest/0/a",
            "testnest/0/f/g",
        ],
        "tes_c": ["custom", "id", "testnest/0/id", "testnest/0/c/0/d"],
    }
    assert parser.sub_sheets["testnest"].lines == [
        {
            "custom": 1,
            "id": 2,
            "testnest/0/id": 3,
            "testnest/0/a": "b",
            "testnest/0/f/g": "h",
        },
    ]
    assert parser.sub_sheets["tes_c"].lines == [
        {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"},
        {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"},
    ]
def test_rollup(self):
    """A single-entry array listed in the schema's rollUp lands on the main sheet."""
    schema_parser = SchemaParser(root_schema_dict={
        'properties': {
            'testA': {
                'type': 'array',
                'rollUp': [ 'testB' ],
                'items': {
                    'type': 'object',
                    'properties': {
                        'testB': {'type': 'string'},
                        'testC': {'type': 'string'}
                    }
                }
            },
        }
    }, rollup=True)
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[OrderedDict([
            ('testA', [OrderedDict([('testB', '1'), ('testC', '2')])]),
        ])],
        schema_parser=schema_parser
    )
    parser.parse()
    # The rolled-up column appears on the main sheet with '[]' path notation.
    assert list(parser.main_sheet) == [ 'testA[]/testB' ]
    assert parser.main_sheet.lines == [ {'testA[]/testB': '1'} ]
    # The full array still gets its own sub-sheet with all columns.
    assert len(parser.sub_sheets) == 1
    assert set(parser.sub_sheets['testA']) == set(['ocid', 'testB', 'testC'])
    assert parser.sub_sheets['testA'].lines == [{'testB':'1', 'testC': '2'}]
def test_parse_ids(self):
    """Parent ids are copied onto sub-sheet rows; nested objects stay on main."""
    input_row = OrderedDict([
        ("id", 2),
        ("a", "b"),
        ("c", [
            OrderedDict([("id", 3), ("d", "e")]),
            OrderedDict([("id", 3), ("d", "e2")]),
        ]),
        # A nested object must not interfere with ID output.
        ("f", {"g": "h"}),
    ])
    parser = JSONParser(root_json_dict=[input_row], root_id="")
    parser.parse()
    assert list(parser.main_sheet) == ["id", "a", "f/g"]
    assert parser.main_sheet.lines == [{"id": 2, "a": "b", "f/g": "h"}]
    assert listify(parser.sub_sheets) == {"c": ["id", "c/0/id", "c/0/d"]}
    assert parser.sub_sheets["c"].lines == [
        {"id": 2, "c/0/id": 3, "c/0/d": "e"},
        {"id": 2, "c/0/id": 3, "c/0/d": "e2"},
    ]
def test_column_matching(self, tmpdir):
    """A schema-declared string array is matched and output under 'c:array'."""
    schema_path = tmpdir.join('test.json')
    schema_path.write('''{
        "properties": {
            "c": {
                "type": "array",
                "items": {"type": "string"}
            }
        }
    }''')
    schema_parser = SchemaParser(schema_filename=schema_path.strpath)
    schema_parser.parse()
    data = [OrderedDict([('c', ['d'])])]
    parser = JSONParser(root_json_dict=data, schema_parser=schema_parser)
    parser.parse()
    assert list(parser.main_sheet) == ['c:array']
    assert parser.main_sheet.lines == [{'c:array': 'd'}]
    assert len(parser.sub_sheets) == 0
def test_parse_array():
    """A list of scalars is flattened to a single ';'-joined cell."""
    rows = [OrderedDict([('testarray', ['item', 'anotheritem'])])]
    parser = JSONParser(root_json_dict=rows)
    parser.parse()
    assert list(parser.main_sheet) == ['testarray']
    assert parser.main_sheet.lines == [{'testarray': 'item;anotheritem'}]
    assert parser.sub_sheets == {}
def test_parse_ids_nested(self):
    """An id inside a nested object ('testnest/id') is carried into sub-sheet rows."""
    parser = JSONParser(root_json_dict=[OrderedDict([
        ('id', 2),
        ('a', 'b'),
        ('testnest', OrderedDict([
            ('id', 3),
            ('c', [OrderedDict([('d', 'e')]), OrderedDict([('d', 'e2')])])
        ])),
        ('f', {'g':'h'})  # Check that having nested objects doesn't break ID output
    ])], root_id='')
    parser.parse()
    assert list(parser.main_sheet) == [
        'id',
        'a',
        'testnest/id',
        'f/g'
    ]
    assert parser.main_sheet.lines == [
        {
            'id': 2,
            'a': 'b',
            'testnest/id': 3,
            'f/g': 'h'
        }
    ]
    # Sub-sheet columns reference the ids of both ancestor levels.
    assert listify(parser.sub_sheets) == {'c': ['main/id','main/testnest/id','d']}
    assert parser.sub_sheets['c'].lines == [
        {
            'main/id': 2,
            'main/testnest/id': 3,
            'd':'e'
        },
        {
            'main/id': 2,
            'main/testnest/id': 3,
            'd':'e2'
        },
    ]
def test_sub_sheet_names(self, tmpdir):
    """An array whose items are a $ref gets its sub-sheet named after the referenced definition."""
    schema_path = tmpdir.join('test.json')
    schema_path.write('''{
        "properties": {
            "c": {
                "type": "array",
                "items": {"$ref": "#/testB"}
            }
        },
        "testB": {
            "type": "object",
            "properties": {
                "d": { "type": "string" },
                "f": { "type": "string" }
            }
        }
    }''')
    schema_parser = SchemaParser(schema_filename=schema_path.strpath)
    schema_parser.parse()
    data = [OrderedDict([('a', 'b'), ('c', [OrderedDict([('d', 'e')])])])]
    parser = JSONParser(root_json_dict=data, schema_parser=schema_parser)
    parser.parse()
    assert list(parser.main_sheet) == ['a']
    assert parser.main_sheet.lines == [{'a': 'b'}]
    assert len(parser.sub_sheets) == 1
    # The sheet is named 'testB' (the $ref target), not 'c'.
    assert list(parser.sub_sheets['testB']) == list(['ocid', 'd', 'f'])
    assert parser.sub_sheets['testB'].lines == [{'d': 'e'}]
def test_flatten_multiplesheets(use_titles, use_schema, root_id, root_id_kwargs,
                                input_list, expected_output_dict, recwarn, comment,
                                warning_messages, tmpdir, reversible):
    """Parametrized end-to-end check: every produced sheet must match expectations.

    The fixture parameters are supplied by a parametrize decorator outside this
    block; `comment`, `warning_messages` and `reversible` are unused here and
    presumably consumed by sibling tests sharing the same parameter list.
    """
    # Not sure why, but this seems to be necessary to have warnings picked up
    # on Python 2.7 and 3.3, but 3.4 and 3.5 are fine without it
    import warnings
    warnings.simplefilter('always')
    extra_kwargs = {'use_titles': use_titles}
    extra_kwargs.update(root_id_kwargs)
    if use_schema:
        schema_parser = SchemaParser(
            root_schema_dict=create_schema(root_id) if use_schema else {"properties": {}},
            rollup=True,
            **extra_kwargs
        )
        schema_parser.parse()
    else:
        schema_parser = None
    # Round-trip through a real file so the json_filename code path is exercised.
    with tmpdir.join('input.json').open('w') as fp:
        json.dump({
            'mykey': [inject_root_id(root_id, input_row) for input_row in input_list]
        }, fp)
    parser = JSONParser(
        json_filename=tmpdir.join('input.json').strpath,
        root_list_path='mykey',
        schema_parser=schema_parser,
        **extra_kwargs)
    parser.parse()
    # Inject the root id into the expected rows too, so both sides agree.
    expected_output_dict = OrderedDict([(sheet_name, [inject_root_id(root_id, line) for line in lines]) for sheet_name, lines in expected_output_dict.items()])
    # Only sheets that actually produced rows are compared.
    output = {sheet_name:sheet.lines for sheet_name, sheet in parser.sub_sheets.items() if sheet.lines}
    output['custom_main'] = parser.main_sheet.lines
    assert output == expected_output_dict
def test_parse_ids(self):
    """Parent row ids are repeated on each sub-sheet row when root_id is ''."""
    parser = JSONParser(
        root_json_dict=[
            OrderedDict([
                ('id', 2),
                ('a', 'b'),
                ('c', [
                    OrderedDict([('id', 3), ('d', 'e')]),
                    OrderedDict([('id', 3), ('d', 'e2')])
                ]),
                (
                    'f',
                    {
                        'g': 'h'
                    }
                )  # Check that having nested objects doesn't break ID output
            ])
        ],
        root_id='')
    parser.parse()
    assert list(parser.main_sheet) == ['id', 'a', 'f/g']
    assert parser.main_sheet.lines == [{'id': 2, 'a': 'b', 'f/g': 'h'}]
    assert listify(parser.sub_sheets) == {'c': ['id', 'c/0/id', 'c/0/d']}
    assert parser.sub_sheets['c'].lines == [
        {
            'id': 2,
            'c/0/id': 3,
            'c/0/d': 'e'
        },
        {
            'id': 2,
            'c/0/id': 3,
            'c/0/d': 'e2'
        },
    ]
def test_xml_basic_example():
    """Flatten the IATI example XML: two activities on the main sheet plus
    'transaction' and 'recipient-country' sub-sheets keyed by iati-identifier."""
    parser = JSONParser(
        json_filename='examples/iati/expected.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier')
    parser.parse()
    # XML attributes become '@'-prefixed path segments.
    assert list(parser.main_sheet) == ['iati-identifier', 'reporting-org/@ref', 'reporting-org/@type', 'reporting-org/narrative', 'title/narrative', 'description/narrative', 'participating-org/@ref', 'participating-org/@role', 'activity-status/@code', 'activity-date/@iso-date', 'activity-date/@type']
    assert parser.main_sheet.lines == [
        {'activity-date/@type': '1', 'reporting-org/narrative': 'Organisation name', 'participating-org/@ref': 'AA-AAA-123456789', 'title/narrative': 'A title', 'participating-org/@role': '1', 'reporting-org/@ref': 'AA-AAA-123456789', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'reporting-org/@type': '40', 'description/narrative': 'A description', 'activity-date/@iso-date': '2011-10-01', 'activity-status/@code': '2'},
        {'activity-date/@type': '2', 'reporting-org/narrative': 'Organisation name', 'participating-org/@ref': 'AA-AAA-123456789', 'title/narrative': 'Another title', 'participating-org/@role': '1', 'reporting-org/@ref': 'AA-AAA-123456789', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'reporting-org/@type': '40', 'description/narrative': 'Another description', 'activity-date/@iso-date': '2016-01-01', 'activity-status/@code': '3'}
    ]
    assert set(parser.sub_sheets.keys()) == set(['transaction', 'recipient-country'])
    # Sub-sheet rows carry the id_name column ('iati-identifier') of their parent.
    assert list(parser.sub_sheets['transaction']) == ['iati-identifier', 'transaction/0/transaction-type/@code', 'transaction/0/transaction-date/@iso-date', 'transaction/0/value/@value-date', 'transaction/0/value']
    assert parser.sub_sheets['transaction'].lines == [
        {'transaction/0/value/@value-date': '2012-01-01', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'transaction/0/transaction-date/@iso-date': '2012-01-01', 'transaction/0/value': '10', 'transaction/0/transaction-type/@code': '2'},
        {'transaction/0/value/@value-date': '2012-03-03', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'transaction/0/transaction-date/@iso-date': '2012-03-03', 'transaction/0/value': '20', 'transaction/0/transaction-type/@code': '3'},
        {'transaction/0/value/@value-date': '2013-04-04', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'transaction/0/transaction-date/@iso-date': '2013-04-04', 'transaction/0/value': '30', 'transaction/0/transaction-type/@code': '2'},
        {'transaction/0/value/@value-date': '2013-05-05', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'transaction/0/transaction-date/@iso-date': '2013-05-05', 'transaction/0/value': '40', 'transaction/0/transaction-type/@code': '3'}
    ]
    assert list(parser.sub_sheets['recipient-country']) == ['iati-identifier', 'recipient-country/0/@code', 'recipient-country/0/@percentage']
    assert parser.sub_sheets['recipient-country'].lines == [
        {'iati-identifier': 'AA-AAA-123456789-ABC123', 'recipient-country/0/@code': 'AF', 'recipient-country/0/@percentage': '40'},
        {'iati-identifier': 'AA-AAA-123456789-ABC123', 'recipient-country/0/@code': 'XK', 'recipient-country/0/@percentage': '60'},
        {'iati-identifier': 'AA-AAA-123456789-ABC124', 'recipient-country/0/@code': 'AG', 'recipient-country/0/@percentage': '30'},
        {'iati-identifier': 'AA-AAA-123456789-ABC124', 'recipient-country/0/@code': 'XK', 'recipient-country/0/@percentage': '70'}
    ]
def test_jsonparser_bad_json(tmpdir):
    """Malformed JSON raises our error type and the stdlib base ValueError."""
    broken = tmpdir.join("test.json")
    broken.write('{"a":"b",}')
    # The specific error first, then the Python base error type it subclasses.
    for err in (BadlyFormedJSONError, ValueError):
        with pytest.raises(err):
            JSONParser(json_filename=broken.strpath)
def test_parse_nested_dict_json_dict():
    """Nested objects flatten into '/'-joined column names on the main sheet."""
    nested = OrderedDict([("d", "e")])
    parser = JSONParser(root_json_dict=[OrderedDict([("a", "b"), ("c", nested)])])
    parser.parse()
    assert list(parser.main_sheet) == ["a", "c/d"]
    assert parser.main_sheet.lines == [{"a": "b", "c/d": "e"}]
    assert parser.sub_sheets == {}
def test_parse_array():
    """Mixed scalar arrays (including numbers) join into one ';' cell."""
    values = ["item", "anotheritem", 42]
    parser = JSONParser(root_json_dict=[OrderedDict([("testarray", values)])])
    parser.parse()
    assert list(parser.main_sheet) == ["testarray"]
    assert parser.main_sheet.lines == [{"testarray": "item;anotheritem;42"}]
    assert parser.sub_sheets == {}
def test_jsonparser_bad_json(tmpdir):
    """Trailing-comma JSON raises BadlyFormedJSONError, which is a ValueError."""
    bad_json = tmpdir.join('test.json')
    bad_json.write('{"a":"b",}')
    with pytest.raises(BadlyFormedJSONError):
        JSONParser(json_filename=bad_json.strpath)
    # JSONInputValueError also matches against ValueError
    with pytest.raises(ValueError):
        JSONParser(json_filename=bad_json.strpath)
def flatten(input_name, schema=None, output_name=None, output_format='all',
            main_sheet_name='main', root_list_path='main', root_is_list=False,
            sheet_prefix='', filter_field=None, filter_value=None, rollup=False,
            root_id=None, use_titles=False, xml=False, id_name='id',
            disable_local_refs=False, remove_empty_schema_columns=False,
            truncation_length=3, **_):
    """
    Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).

    ``output_format`` may be 'all' (write every format in OUTPUT_FORMATS) or a
    single key of OUTPUT_FORMATS; anything else raises. ``filter_field`` and
    ``filter_value`` must be supplied together. Extra keyword arguments are
    accepted and ignored (``**_``) so callers can pass a shared options dict.
    """
    # filter_field/filter_value only make sense as a pair.
    if (filter_field is None and filter_value is not None) or (filter_field is not None and filter_value is None):
        raise Exception('You must use filter_field and filter_value together')
    # Parse the schema first (if given) so the JSON parser can reuse its layout.
    if schema:
        schema_parser = SchemaParser(
            schema_filename=schema,
            rollup=rollup,
            root_id=root_id,
            use_titles=use_titles,
            disable_local_refs=disable_local_refs,
            truncation_length=truncation_length)
        schema_parser.parse()
    else:
        schema_parser = None
    parser = JSONParser(
        json_filename=input_name,
        # A top-level JSON list has no wrapping key to descend into.
        root_list_path=None if root_is_list else root_list_path,
        schema_parser=schema_parser,
        root_id=root_id,
        use_titles=use_titles,
        xml=xml,
        id_name=id_name,
        filter_field=filter_field,
        filter_value=filter_value,
        remove_empty_schema_columns=remove_empty_schema_columns,
        truncation_length=truncation_length)
    parser.parse()

    def spreadsheet_output(spreadsheet_output_class, name):
        # Write all parsed sheets using the given output backend.
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name=main_sheet_name,
            output_name=name,
            sheet_prefix=sheet_prefix)
        spreadsheet_output.write_sheets()

    if output_format == 'all':
        if not output_name:
            output_name = 'flattened'
        for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
            spreadsheet_output(spreadsheet_output_class, output_name+FORMATS_SUFFIX[format_name])
    elif output_format in OUTPUT_FORMATS.keys():  # in dictionary of allowed formats
        if not output_name:
            output_name = 'flattened' + FORMATS_SUFFIX[output_format]
        spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
    else:
        raise Exception('The requested format is not available')
def test_flatten_multiplesheets(
    use_titles,
    use_schema,
    root_id,
    root_id_kwargs,
    input_list,
    expected_output_dict,
    recwarn,
    comment,
    warning_messages,
    tmpdir,
    reversible,
):
    """Parametrized end-to-end check of the main sheet plus all sub-sheets.

    Parameters come from a parametrize decorator outside this block; `comment`,
    `warning_messages` and `reversible` are unused here and presumably shared
    with sibling tests using the same parameter list.
    """
    # Not sure why, but this seems to be necessary to have warnings picked up
    # on Python 2.7 and 3.3, but 3.4 and 3.5 are fine without it
    import warnings

    warnings.simplefilter("always")
    extra_kwargs = {"use_titles": use_titles}
    extra_kwargs.update(root_id_kwargs)
    if use_schema:
        schema_parser = SchemaParser(root_schema_dict=create_schema(root_id) if use_schema else {"properties": {}}, rollup=True, **extra_kwargs)
        schema_parser.parse()
    else:
        schema_parser = None
    # Write the input through a real file to exercise the json_filename path.
    with tmpdir.join("input.json").open("w") as fp:
        json.dump(
            {
                "mykey": [
                    inject_root_id(root_id, input_row) for input_row in input_list
                ]
            },
            fp,
        )
    parser = JSONParser(json_filename=tmpdir.join("input.json").strpath, root_list_path="mykey", schema_parser=schema_parser, **extra_kwargs)
    parser.parse()
    # Inject the root id into the expected rows so both sides agree.
    expected_output_dict = OrderedDict([
        (sheet_name, [inject_root_id(root_id, line) for line in lines])
        for sheet_name, lines in expected_output_dict.items()
    ])
    # Only sheets that actually produced rows are compared.
    output = {
        sheet_name: sheet.lines
        for sheet_name, sheet in parser.sub_sheets.items()
        if sheet.lines
    }
    output["custom_main"] = parser.main_sheet.lines
    assert output == expected_output_dict
def test_root_list_path():
    """root_list_path selects which key holds the list of rows."""
    doc = {"custom_key": [OrderedDict([("a", "b"), ("c", "d")])]}
    parser = JSONParser(root_json_dict=doc, root_list_path="custom_key")
    parser.parse()
    assert list(parser.main_sheet) == ["a", "c"]
    assert parser.main_sheet.lines == [{"a": "b", "c": "d"}]
    assert parser.sub_sheets == {}
def test_parse_nested_list_json_dict():
    """A list of objects moves off the main sheet into a 'c' sub-sheet."""
    parser = JSONParser(
        root_json_dict=[OrderedDict([("a", "b"), ("c", [OrderedDict([("d", "e")])]),])]
    )
    parser.parse()
    assert list(parser.main_sheet) == ["a"]
    assert parser.main_sheet.lines == [{"a": "b"}]
    # BUG FIX: these two comparisons were missing the `assert` keyword, so the
    # expressions were evaluated and silently discarded (flake8-bugbear B015).
    # NOTE(review): now that they are actually checked, update the expected
    # values if they turn out to be stale relative to the parser's output.
    assert listify(parser.sub_sheets) == {"c": ["d"]}
    assert parser.sub_sheets["c"].lines == [{"d": "e"}]
def test_flatten(
    use_titles,
    use_schema,
    root_id,
    root_id_kwargs,
    input_list,
    expected_output_list,
    recwarn,
    comment,
    warning_messages,
    tmpdir,
    reversible,
):
    """Parametrized end-to-end check of the main sheet's flattened rows.

    Parameters come from a parametrize decorator outside this block; `comment`,
    `warning_messages` and `reversible` are unused here and presumably shared
    with sibling tests using the same parameter list.
    """
    # Not sure why, but this seems to be necessary to have warnings picked up
    # on Python 2.7 and 3.3, but 3.4 and 3.5 are fine without it
    import warnings

    warnings.simplefilter("always")
    extra_kwargs = {"use_titles": use_titles}
    extra_kwargs.update(root_id_kwargs)
    if use_schema:
        schema_parser = SchemaParser(root_schema_dict=create_schema(root_id) if use_schema else {"properties": {}}, rollup=True, **extra_kwargs)
        schema_parser.parse()
    else:
        schema_parser = None
    # Write the input through a real file to exercise the json_filename path.
    with tmpdir.join("input.json").open("w") as fp:
        json.dump(
            {
                "mykey": [
                    inject_root_id(root_id, input_row) for input_row in input_list
                ]
            },
            fp,
        )
    parser = JSONParser(json_filename=tmpdir.join("input.json").strpath, root_list_path="mykey", schema_parser=schema_parser, **extra_kwargs)
    parser.parse()
    expected_output_list = [
        inject_root_id(root_id, expected_output_dict)
        for expected_output_dict in expected_output_list
    ]
    if expected_output_list == [{}]:
        # We don't expect an empty dictionary
        expected_output_list = []
    assert list(parser.main_sheet.lines) == expected_output_list
def test_parse_ids_subsheet(self):
    """Root id and ancestor ids are copied into each sub-sheet row.

    No root_id is passed, yet the expected columns include 'ocid' — this
    relies on the parser's default root id (presumably 'ocid'; confirm
    against the JSONParser signature).
    """
    parser = JSONParser(root_json_dict=[
        OrderedDict([
            ('ocid', 1),
            ('id', 2),
            ('testnest', [
                OrderedDict([
                    ('id', 3),
                    ('a', 'b'),
                    ('c', [
                        OrderedDict([('d', 'e')]),
                        OrderedDict([('d', 'e2')])
                    ]),
                    ('f', {
                        'g': 'h'
                    })  # Check that having nested objects doesn't break ID output
                ])
            ])
        ])
    ])
    parser.parse()
    assert list(parser.main_sheet) == ['ocid', 'id']
    assert parser.main_sheet.lines == [{
        'ocid': 1,
        'id': 2,
    }]
    # Sub-sheets reference ancestor ids via 'main/...' column names.
    assert listify(parser.sub_sheets) == {
        'testnest': ['ocid', 'main/id', 'id', 'a', 'f/g'],
        'c': ['ocid', 'main/id', 'main/testnest[]/id', 'd']
    }
    assert parser.sub_sheets['testnest'].lines == [
        {
            'ocid': 1,
            'main/id': 2,
            'id': 3,
            'a': 'b',
            'f/g': 'h',
        },
    ]
    assert parser.sub_sheets['c'].lines == [
        {
            'ocid': 1,
            'main/id': 2,
            'main/testnest[]/id': 3,
            'd': 'e'
        },
        {
            'ocid': 1,
            'main/id': 2,
            'main/testnest[]/id': 3,
            'd': 'e2'
        },
    ]
def test_two_parents(self):
    """Two arrays sharing a child name get distinct, truncated sub-sheet names."""
    # This is a copy of test_two_parents from test_schema_parser.py, in
    # order to check that flattening and template generation use the same
    # sheet names
    schema_parser = SchemaParser(
        root_schema_dict={
            'properties': OrderedDict([('Atest', {
                'type': 'array',
                'items': {
                    'type': 'object',
                    'properties': object_in_array_example_properties('Btest', 'Ctest')
                }
            }), ('Dtest', {
                'type': 'array',
                'items': {
                    'type': 'object',
                    'properties': object_in_array_example_properties(
                        'Btest', 'Etest')
                }
            })])
        })
    schema_parser.parse()
    parser = JSONParser(root_json_dict=[{
        'Atest': [{
            'id': 1,
            'Btest': [{
                'Ctest': 2
            }]
        }],
        'Dtest': [{
            'id': 3,
            'Btest': [{
                'Etest': 4
            }]
        }]
    }], schema_parser=schema_parser)
    parser.parse()
    # Everything lives on sub-sheets; the main sheet has no columns.
    assert set(parser.main_sheet) == set()
    # 'Btest' under each parent is disambiguated with a truncated parent prefix.
    assert set(parser.sub_sheets) == set(
        ['Atest', 'Dtest', 'Ate_Btest', 'Dte_Btest'])
    assert list(parser.sub_sheets['Atest']) == ['Atest/0/id']
    assert list(parser.sub_sheets['Dtest']) == ['Dtest/0/id']
    assert list(parser.sub_sheets['Ate_Btest']) == [
        'Atest/0/id', 'Atest/0/Btest/0/Ctest'
    ]
    assert list(parser.sub_sheets['Dte_Btest']) == [
        'Dtest/0/id', 'Dtest/0/Btest/0/Etest'
    ]
def test_parse_nested_dict_json_dict():
    """A nested object contributes '/'-joined columns to the main sheet."""
    row = OrderedDict([('a', 'b'), ('c', OrderedDict([('d', 'e')]))])
    parser = JSONParser(root_json_dict=[row])
    parser.parse()
    assert list(parser.main_sheet) == ['a', 'c/d']
    assert parser.main_sheet.lines == [{'a': 'b', 'c/d': 'e'}]
    assert parser.sub_sheets == {}
def test_xml_empty():
    """An empty XML input yields no columns, no rows and no sub-sheets."""
    parser = JSONParser(
        json_filename='flattentool/tests/fixtures/empty.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier',
    )
    parser.parse()
    assert list(parser.main_sheet) == []
    assert parser.main_sheet.lines == []
    assert parser.sub_sheets == {}
def test_rollup_multiple_values(self, recwarn):
    """Rolling up an array with more than one entry puts a warning placeholder
    on the main sheet and emits a UserWarning; the sub-sheet keeps all rows."""
    schema_parser = SchemaParser(
        root_schema_dict={
            "properties": {
                "testA": {
                    "type": "array",
                    "rollUp": ["testB"],
                    "items": {
                        "type": "object",
                        "properties": {
                            "testB": {"type": "string"},
                            "testC": {"type": "string"},
                        },
                    },
                },
            }
        },
        rollup=True,
    )
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[
            OrderedDict(
                [
                    (
                        "testA",
                        [
                            OrderedDict([("testB", "1"), ("testC", "2")]),
                            OrderedDict([("testB", "3"), ("testC", "4")]),
                        ],
                    ),
                ]
            )
        ],
        schema_parser=schema_parser,
        rollup=True,
    )
    parser.parse()
    assert list(parser.main_sheet) == ["testA/0/testB"]
    # Two values can't be rolled up into one cell, so a warning string is output.
    assert parser.main_sheet.lines == [
        {
            "testA/0/testB": "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
        }
    ]
    assert len(parser.sub_sheets) == 1
    assert set(parser.sub_sheets["testA"]) == set(
        ["testA/0/testB", "testA/0/testC"]
    )
    assert parser.sub_sheets["testA"].lines == [
        {"testA/0/testB": "1", "testA/0/testC": "2"},
        {"testA/0/testB": "3", "testA/0/testC": "4"},
    ]
    w = recwarn.pop(UserWarning)
    assert "Could not provide rollup" in str(w.message)
def test_parse_nested_dict_json_dict():
    """Nested dicts flatten into slash-separated headings."""
    inner = OrderedDict()
    inner['d'] = 'e'
    outer = OrderedDict()
    outer['a'] = 'b'
    outer['c'] = inner
    parser = JSONParser(root_json_dict=[outer])
    parser.parse()
    assert list(parser.main_sheet) == ['a', 'c/d']
    assert parser.main_sheet.lines == [{'a': 'b', 'c/d': 'e'}]
    assert parser.sub_sheets == {}
def test_jsonparser_arguments_exceptions(tmpdir):
    """JSONParser raises ValueError when given too few or too many sources."""
    empty_json = tmpdir.join("test.json")
    empty_json.write("{}")
    # Neither json_filename nor root_json_dict supplied:
    with pytest.raises(ValueError):
        JSONParser()
    # Both supplied at once:
    with pytest.raises(ValueError):
        JSONParser(json_filename=empty_json.strpath, root_json_dict={})
def test_jsonparser_bad_json_utf8():
    """A bad-UTF-8 fixture raises the UTF-8 specific error and all of its bases."""
    fixture = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'fixtures', 'bad-utf8.json')
    # Most specific error class first, then each base class in turn.
    for expected_error in (BadlyFormedJSONErrorUTF8, BadlyFormedJSONError, ValueError):
        with pytest.raises(expected_error):
            JSONParser(json_filename=fixture)
def test_xml_empty():
    """Parsing an empty XML file produces completely empty output."""
    xml_kwargs = dict(
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier',
    )
    parser = JSONParser(
        json_filename='flattentool/tests/fixtures/empty.xml', **xml_kwargs)
    parser.parse()
    assert list(parser.main_sheet) == []
    assert parser.main_sheet.lines == []
    assert parser.sub_sheets == {}
def test_root_list_path():
    """Rows are read from the key named by root_list_path instead of the default."""
    rows = [OrderedDict([('a', 'b'), ('c', 'd')])]
    parser = JSONParser(root_json_dict={'custom_key': rows},
                        root_list_path='custom_key')
    parser.parse()
    assert list(parser.main_sheet) == ['a', 'c']
    assert parser.main_sheet.lines == [{'a': 'b', 'c': 'd'}]
    assert parser.sub_sheets == {}
def test_parse_nested_list_json_dict():
    """A list of objects is moved off the main sheet into a 'c' sub-sheet."""
    parser = JSONParser(root_json_dict=[
        OrderedDict([
            ('a', 'b'),
            ('c', [OrderedDict([('d', 'e')])]),
        ])
    ])
    parser.parse()
    assert list(parser.main_sheet) == ['a']
    assert parser.main_sheet.lines == [{'a': 'b'}]
    # BUG FIX: these two comparisons were missing the `assert` keyword, so the
    # expressions were evaluated and silently discarded (flake8-bugbear B015).
    # NOTE(review): now that they are actually checked, update the expected
    # values if they turn out to be stale relative to the parser's output.
    assert listify(parser.sub_sheets) == {'c': ['d']}
    assert parser.sub_sheets['c'].lines == [{'d': 'e'}]
def test_parse_nested_list_json_dict():
    """Objects inside a list become sub-sheet rows, not main-sheet columns."""
    parser = JSONParser(root_json_dict=[OrderedDict([
        ('a', 'b'),
        ('c', [OrderedDict([('d', 'e')])]),
    ])])
    parser.parse()
    assert list(parser.main_sheet) == [
        'a'
    ]
    assert parser.main_sheet.lines == [
        {'a': 'b'}
    ]
    # BUG FIX: these two comparisons were missing the `assert` keyword, so the
    # expressions were evaluated and silently discarded (flake8-bugbear B015).
    # NOTE(review): now that they are actually checked, update the expected
    # values if they turn out to be stale relative to the parser's output.
    assert listify(parser.sub_sheets) == {'c': ['d']}
    assert parser.sub_sheets['c'].lines == [{'d': 'e'}]
def test_parse_array():
    """String arrays collapse to one semicolon-delimited cell."""
    row = OrderedDict()
    row['testarray'] = ['item', 'anotheritem']
    parser = JSONParser(root_json_dict=[row])
    parser.parse()
    assert list(parser.main_sheet) == ['testarray']
    assert parser.main_sheet.lines == [{'testarray': 'item;anotheritem'}]
    assert parser.sub_sheets == {}
def test_xml_whitespace():
    """Whitespace inside narrative XML elements must not crash the parser."""
    parser = JSONParser(
        json_filename='flattentool/tests/fixtures/narrative_whitespace.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier')
    # BUG FIX: the previous ``try: parse() except TypeError as e: raise e``
    # wrapper was a no-op — it re-raised the very exception it caught. Calling
    # parse() directly fails the test identically if a TypeError escapes.
    parser.parse()
def test_root_list_path():
    """A non-default list key is honoured when flattening."""
    parser = JSONParser(
        root_json_dict={'custom_key': [OrderedDict([('a', 'b'), ('c', 'd')])]},
        root_list_path='custom_key',
    )
    parser.parse()
    main = parser.main_sheet
    assert list(main) == ['a', 'c']
    assert main.lines == [{'a': 'b', 'c': 'd'}]
    assert parser.sub_sheets == {}
def test_sub_sheets(self, tmpdir, remove_empty_schema_columns):
    """Schema-driven sub-sheets; optionally drop sheets/columns with no data."""
    test_schema = tmpdir.join('test.json')
    test_schema.write('''{
        "properties": {
            "c": { "type": "array", "items": {"$ref": "#/testB"} },
            "g": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": { "h": { "type": "string"} }
                }
            }
        },
        "testB": {
            "type": "object",
            "properties": {
                "d": { "type": "string" },
                "f": { "type": "string" }
            }
        }
    }''')
    schema_parser = SchemaParser(schema_filename=test_schema.strpath,
                                 root_id='ocid')
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[
            OrderedDict([
                ('a', 'b'),
                ('c', [OrderedDict([('d', 'e')])]),
            ])
        ],
        schema_parser=schema_parser,
        remove_empty_schema_columns=remove_empty_schema_columns,
    )
    parser.parse()
    assert list(parser.main_sheet) == ['a']
    assert parser.main_sheet.lines == [{'a': 'b'}]
    # BUG FIX: without parentheses this parsed as
    # ``(len(...) == 2) if not remove_empty_schema_columns else 1``, so the
    # remove_empty_schema_columns branch asserted the constant 1 (always true).
    assert len(parser.sub_sheets) == (2 if not remove_empty_schema_columns else 1)
    if not remove_empty_schema_columns:
        assert list(parser.sub_sheets['c']) == list(
            ['ocid', 'c/0/d', 'c/0/f'])
        assert list(parser.sub_sheets['g']) == list(['ocid', 'g/0/h'])
    else:
        assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d'])
    assert parser.sub_sheets['c'].lines == [{'c/0/d': 'e'}]
def test_sub_sheets(self, tmpdir, remove_empty_schema_columns):
    """Schema arrays become sub-sheets; empty schema columns can be pruned."""
    test_schema = tmpdir.join('test.json')
    test_schema.write('''{
        "properties": {
            "c": { "type": "array", "items": {"$ref": "#/testB"} },
            "g": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": { "h": { "type": "string"} }
                }
            }
        },
        "testB": {
            "type": "object",
            "properties": {
                "d": { "type": "string" },
                "f": { "type": "string" }
            }
        }
    }''')
    schema_parser = SchemaParser(
        schema_filename=test_schema.strpath,
        root_id='ocid'
    )
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[OrderedDict([
            ('a', 'b'),
            ('c', [OrderedDict([('d', 'e')])]),
        ])],
        schema_parser=schema_parser,
        remove_empty_schema_columns=remove_empty_schema_columns,
    )
    parser.parse()
    assert list(parser.main_sheet) == ['a']
    assert parser.main_sheet.lines == [{'a': 'b'}]
    # BUG FIX: without parentheses this parsed as
    # ``(len(...) == 2) if not remove_empty_schema_columns else 1``, so the
    # remove_empty_schema_columns branch asserted the constant 1 (always true).
    assert len(parser.sub_sheets) == (2 if not remove_empty_schema_columns else 1)
    if not remove_empty_schema_columns:
        assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d', 'c/0/f'])
        assert list(parser.sub_sheets['g']) == list(['ocid', 'g/0/h'])
    else:
        assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d'])
    assert parser.sub_sheets['c'].lines == [{'c/0/d':'e'}]
def test_parse_basic_json_dict():
    """Each top-level object becomes one row sharing the same headings."""
    first = OrderedDict([("a", "b"), ("c", "d")])
    second = OrderedDict([("a", "e"), ("c", "f")])
    parser = JSONParser(root_json_dict=[first, second])
    parser.parse()
    assert list(parser.main_sheet) == ["a", "c"]
    assert parser.main_sheet.lines == [
        {"a": "b", "c": "d"},
        {"a": "e", "c": "f"},
    ]
    assert parser.sub_sheets == {}
def test_rollup_multiple_values(self, recwarn):
    """With >1 array entry, rollup outputs a warning placeholder and a UserWarning."""
    schema_parser = SchemaParser(root_schema_dict={
        'properties': {
            'testA': {
                'type': 'array',
                'rollUp': ['testB'],
                'items': {
                    'type': 'object',
                    'properties': {
                        'testB': {
                            'type': 'string'
                        },
                        'testC': {
                            'type': 'string'
                        }
                    }
                }
            },
        }
    }, rollup=True)
    schema_parser.parse()
    parser = JSONParser(root_json_dict=[
        OrderedDict([
            ('testA', [
                OrderedDict([('testB', '1'), ('testC', '2')]),
                OrderedDict([('testB', '3'), ('testC', '4')])
            ]),
        ])
    ], schema_parser=schema_parser, rollup=True)
    parser.parse()
    assert list(parser.main_sheet) == ['testA/0/testB']
    # Two values can't be rolled into one cell, so a warning string is written.
    assert parser.main_sheet.lines == [{
        'testA/0/testB': 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'
    }]
    assert len(parser.sub_sheets) == 1
    assert set(parser.sub_sheets['testA']) == set(
        ['testA/0/testB', 'testA/0/testC'])
    assert parser.sub_sheets['testA'].lines == [{
        'testA/0/testB': '1',
        'testA/0/testC': '2'
    }, {
        'testA/0/testB': '3',
        'testA/0/testC': '4'
    }]
    w = recwarn.pop(UserWarning)
    assert 'Could not provide rollup' in str(w.message)
def test_sub_sheets(self, tmpdir, remove_empty_schema_columns):
    """Schema-driven sub-sheets; empty schema sheets/columns may be removed."""
    test_schema = tmpdir.join("test.json")
    test_schema.write(
        """{
        "properties": {
            "c": { "type": "array", "items": {"$ref": "#/testB"} },
            "g": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": { "h": { "type": "string"} }
                }
            }
        },
        "testB": {
            "type": "object",
            "properties": {
                "d": { "type": "string" },
                "f": { "type": "string" }
            }
        }
    }"""
    )
    schema_parser = SchemaParser(
        schema_filename=test_schema.strpath, root_id="ocid"
    )
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[
            OrderedDict([("a", "b"), ("c", [OrderedDict([("d", "e")])]),])
        ],
        schema_parser=schema_parser,
        remove_empty_schema_columns=remove_empty_schema_columns,
    )
    parser.parse()
    assert list(parser.main_sheet) == ["a"]
    assert parser.main_sheet.lines == [{"a": "b"}]
    # BUG FIX: without parentheses this parsed as
    # ``(len(...) == 2) if not remove_empty_schema_columns else 1``, so the
    # remove_empty_schema_columns branch asserted the constant 1 (always true).
    assert len(parser.sub_sheets) == (2 if not remove_empty_schema_columns else 1)
    if not remove_empty_schema_columns:
        assert list(parser.sub_sheets["c"]) == list(["ocid", "c/0/d", "c/0/f"])
        assert list(parser.sub_sheets["g"]) == list(["ocid", "g/0/h"])
    else:
        assert list(parser.sub_sheets["c"]) == list(["ocid", "c/0/d"])
    assert parser.sub_sheets["c"].lines == [{"c/0/d": "e"}]
def test_parse_ids_subsheet(self):
    """With root_id='custom', every sub-sheet row repeats 'custom' and parent ids."""
    parser = JSONParser(root_json_dict=[OrderedDict([
        ('custom', 1),
        ('id', 2),
        ('testnest', [
            OrderedDict([
                ('id', 3),
                ('a', 'b'),
                ('c', [OrderedDict([('d', 'e')]), OrderedDict([('d', 'e2')])]),
                ('f', {'g':'h'})  # Check that having nested objects doesn't break ID output
            ])
        ])
    ])], root_id='custom')
    parser.parse()
    assert list(parser.main_sheet) == [
        'custom',
        'id'
    ]
    assert parser.main_sheet.lines == [
        {
            'custom': 1,
            'id': 2,
        }
    ]
    # Deeply nested 'c' gets a truncated sheet name ('tes_c').
    assert listify(parser.sub_sheets) == {
        'testnest': ['custom', 'id', 'testnest/0/id', 'testnest/0/a', 'testnest/0/f/g'],
        'tes_c': ['custom', 'id', 'testnest/0/id', 'testnest/0/c/0/d']
    }
    assert parser.sub_sheets['testnest'].lines == [
        {
            'custom': 1,
            'id': 2,
            'testnest/0/id': 3,
            'testnest/0/a': 'b',
            'testnest/0/f/g': 'h',
        },
    ]
    assert parser.sub_sheets['tes_c'].lines == [
        {
            'custom': 1,
            'id': 2,
            'testnest/0/id': 3,
            'testnest/0/c/0/d':'e'
        },
        {
            'custom': 1,
            'id': 2,
            'testnest/0/id': 3,
            'testnest/0/c/0/d':'e2'
        },
    ]
def flatten(input_name, schema=None, output_name='releases', output_format='all',
            main_sheet_name='main', root_list_path='releases', rollup=False,
            root_id='ocid', use_titles=False, **_):
    """
    Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).

    ``output_format`` may be 'all' (write every format in OUTPUT_FORMATS) or a
    single OUTPUT_FORMATS key; anything else raises. Extra keyword arguments
    are accepted and ignored (``**_``).
    """
    # Parse the schema first (if given) so the JSON parser can reuse its layout.
    if schema:
        schema_parser = SchemaParser(schema_filename=schema, rollup=rollup,
                                     root_id=root_id, use_titles=use_titles,
                                     main_sheet_name=main_sheet_name)
        schema_parser.parse()
    else:
        schema_parser = None
    parser = JSONParser(json_filename=input_name, root_list_path=root_list_path,
                        schema_parser=schema_parser,
                        main_sheet_name=main_sheet_name, root_id=root_id,
                        use_titles=use_titles)
    parser.parse()

    def spreadsheet_output(spreadsheet_output_class, name):
        # Write all parsed sheets using the given output backend.
        spreadsheet_output = spreadsheet_output_class(
            parser=parser, main_sheet_name=main_sheet_name, output_name=name)
        spreadsheet_output.write_sheets()

    if output_format == 'all':
        for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
            spreadsheet_output(spreadsheet_output_class,
                               output_name + FORMATS_SUFFIX[format_name])
    elif output_format in OUTPUT_FORMATS.keys(
    ):  # in dictionary of allowed formats
        spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
    else:
        raise Exception('The requested format is not available')
def test_parse_basic_json_dict():
    """Two flat objects flatten to two rows under shared headings."""
    rows = [
        OrderedDict([('a', 'b'), ('c', 'd')]),
        OrderedDict([('a', 'e'), ('c', 'f')]),
    ]
    parser = JSONParser(root_json_dict=rows)
    parser.parse()
    assert list(parser.main_sheet) == ['a', 'c']
    assert parser.main_sheet.lines == [{'a': 'b', 'c': 'd'}, {'a': 'e', 'c': 'f'}]
    assert parser.sub_sheets == {}
def test_two_parents(self):
    """Two arrays sharing a child name get distinct, prefix-truncated sheet names."""
    # This is a copy of test_two_parents from test_schema_parser.py, in
    # order to check that flattening and template generation use the same
    # sheet names
    schema_parser = SchemaParser(root_schema_dict={
        'properties': OrderedDict([
            ('Atest', {
                'type': 'array',
                'items': {'type': 'object',
                          'properties': object_in_array_example_properties('Btest', 'Ctest')}
            }),
            ('Dtest', {
                'type': 'array',
                'items': {'type': 'object',
                          'properties': object_in_array_example_properties('Btest', 'Etest')}
            })
        ])
    })
    schema_parser.parse()
    parser = JSONParser(
        root_json_dict=[{
            'Atest': [{
                'id': 1,
                'Btest': [{
                    'Ctest': 2
                }]
            }],
            'Dtest': [{
                'id': 3,
                'Btest': [{
                    'Etest': 4
                }]
            }]
        }],
        schema_parser=schema_parser
    )
    parser.parse()
    # Everything lives on sub-sheets; the main sheet has no columns.
    assert set(parser.main_sheet) == set()
    # 'Btest' under each parent is disambiguated with a truncated parent prefix.
    assert set(parser.sub_sheets) == set(['Atest', 'Dtest', 'Ate_Btest', 'Dte_Btest'])
    assert list(parser.sub_sheets['Atest']) == ['Atest/0/id']
    assert list(parser.sub_sheets['Dtest']) == ['Dtest/0/id']
    assert list(parser.sub_sheets['Ate_Btest']) == ['Atest/0/id', 'Atest/0/Btest/0/Ctest']
    assert list(parser.sub_sheets['Dte_Btest']) == ['Dtest/0/id', 'Dtest/0/Btest/0/Etest']
def test_column_matching(self, tmpdir):
    """A schema string-array column is matched and output under the plain name 'c'."""
    test_schema = tmpdir.join('test.json')
    test_schema.write('''{
        "properties": {
            "c": {
                "type": "array",
                "items": {"type": "string"}
            }
        }
    }''')
    schema_parser = SchemaParser(schema_filename=test_schema.strpath)
    schema_parser.parse()
    parser = JSONParser(root_json_dict=[OrderedDict([
        ('c', ['d']),
    ])], schema_parser=schema_parser)
    parser.parse()
    # The array stays on the main sheet rather than producing a sub-sheet.
    assert list(parser.main_sheet) == ['c']
    assert parser.main_sheet.lines == [{'c': 'd'}]
    assert len(parser.sub_sheets) == 0
def test_rollup_multiple_values(self, recwarn):
    """Rolling up a multi-row array puts a warning string in the main sheet
    and emits a UserWarning, while the sub-sheet keeps every row."""
    schema_parser = SchemaParser(root_schema_dict={
        'properties': {
            'testA': {
                'type': 'array',
                'rollUp': ['testB'],
                'items': {
                    'type': 'object',
                    'properties': {
                        'testB': {'type': 'string'},
                        'testC': {'type': 'string'},
                    },
                },
            },
        }
    }, rollup=True)
    schema_parser.parse()

    rows = [
        OrderedDict([('testB', '1'), ('testC', '2')]),
        OrderedDict([('testB', '3'), ('testC', '4')]),
    ]
    parser = JSONParser(
        root_json_dict=[OrderedDict([('testA', rows)])],
        schema_parser=schema_parser)
    parser.parse()

    assert list(parser.main_sheet) == ['testA[]/testB']
    # With more than one row, the rolled-up cell carries a warning message
    # instead of a value.
    assert parser.main_sheet.lines == [
        {
            'testA[]/testB': 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'
        }
    ]
    assert len(parser.sub_sheets) == 1
    assert set(parser.sub_sheets['testA']) == set(['ocid', 'testB', 'testC'])
    assert parser.sub_sheets['testA'].lines == [
        {'testB': '1', 'testC': '2'},
        {'testB': '3', 'testC': '4'},
    ]
    w = recwarn.pop(UserWarning)
    assert 'Could not provide rollup' in text_type(w.message)
def flatten(input_name, schema=None, output_name='releases', output_format='all', main_sheet_name='main', root_list_path='releases', rollup=False, root_id='ocid', use_titles=False, **_):
    """
    Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).

    :param input_name: path of the JSON file to flatten
    :param schema: optional path of a JSON schema file; when given, a
        SchemaParser is built from it and passed to the JSONParser
    :param output_name: base name of the file(s) written
    :param output_format: a key of OUTPUT_FORMATS, or 'all' to write every
        known format (each with its suffix from FORMATS_SUFFIX)
    :raises Exception: if output_format is neither 'all' nor a known format

    NOTE(review): a near-identical copy of this function appears earlier in
    this file; consider deduplicating.
    """
    if schema:
        # Parse the schema first so the JSONParser can use its sheet layout.
        schema_parser = SchemaParser(
            schema_filename=schema,
            rollup=rollup,
            root_id=root_id,
            use_titles=use_titles,
            main_sheet_name=main_sheet_name)
        schema_parser.parse()
    else:
        schema_parser = None
    parser = JSONParser(
        json_filename=input_name,
        root_list_path=root_list_path,
        schema_parser=schema_parser,
        main_sheet_name=main_sheet_name,
        root_id=root_id,
        use_titles=use_titles)
    parser.parse()

    def spreadsheet_output(spreadsheet_output_class, name):
        # Instantiate the given output writer and write all sheets to `name`.
        # (The local variable intentionally shadows this function's name.)
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name=main_sheet_name,
            output_name=name)
        spreadsheet_output.write_sheets()

    if output_format == 'all':
        # Write one output per registered format, each with its own suffix.
        for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
            spreadsheet_output(spreadsheet_output_class, output_name+FORMATS_SUFFIX[format_name])
    elif output_format in OUTPUT_FORMATS.keys():   # in dictionary of allowed formats
        spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
    else:
        raise Exception('The requested format is not available')
def test_varyin_transaction_count():
    """XML activities with 0, 1 and 2 transactions each flatten correctly.

    (The name's "varyin" typo is kept because renaming would change the
    public test identifier.)
    """
    parser = JSONParser(
        json_filename='flattentool/tests/fixtures/varying_transaction_count.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier')
    parser.parse()

    assert list(parser.main_sheet) == ['iati-identifier']
    assert parser.main_sheet.lines == [
        {'iati-identifier': 'AA-AAA-123456789-ABC123'},
        {'iati-identifier': 'AA-AAA-123456789-ABC124'},
        {'iati-identifier': 'AA-AAA-123456789-ABC125'},
    ]

    assert set(parser.sub_sheets.keys()) == set(['transaction'])
    assert list(parser.sub_sheets['transaction']) == [
        'iati-identifier',
        'transaction/0/transaction-date/@iso-date',
        'transaction/0/value/@value-date',
        'transaction/0/value',
    ]
    # ABC124 has no transactions, ABC123 has two, ABC125 has one.
    assert parser.sub_sheets['transaction'].lines == [
        {'iati-identifier': 'AA-AAA-123456789-ABC123',
         'transaction/0/value/@value-date': '2012-01-01',
         'transaction/0/transaction-date/@iso-date': '2012-01-01',
         'transaction/0/value': '10'},
        {'iati-identifier': 'AA-AAA-123456789-ABC123',
         'transaction/0/value/@value-date': '2012-02-02',
         'transaction/0/transaction-date/@iso-date': '2012-02-02',
         'transaction/0/value': '20'},
        {'iati-identifier': 'AA-AAA-123456789-ABC125',
         'transaction/0/value/@value-date': '2012-03-03',
         'transaction/0/transaction-date/@iso-date': '2012-03-03',
         'transaction/0/value': '30'},
    ]
def test_flatten(use_titles, use_schema, root_id, root_id_kwargs, input_list,
                 expected_output_list, recwarn, comment, warning_messages,
                 tmpdir, reversible):
    """Round-trip a parametrised input list through JSONParser and compare the
    main-sheet rows against the expected output.

    Parameters come from the enclosing parametrisation; `recwarn`, `comment`,
    `warning_messages` and `reversible` are unused here (presumably consumed
    by sibling tests sharing the same parameter sets — verify against the
    parametrise decorator, which is outside this view).
    """
    # Not sure why, but this seems to be necessary to have warnings picked up
    # on Python 2.7 and 3.3, but 3.4 and 3.5 are fine without it
    import warnings
    warnings.simplefilter('always')

    extra_kwargs = {'use_titles': use_titles}
    extra_kwargs.update(root_id_kwargs)

    if use_schema:
        # Fix: the original ternary `create_schema(root_id) if use_schema
        # else {"properties": {}}` was dead code — this branch only runs when
        # use_schema is truthy, so the else-arm could never be taken.
        schema_parser = SchemaParser(
            root_schema_dict=create_schema(root_id),
            rollup=True,
            **extra_kwargs
        )
        schema_parser.parse()
    else:
        schema_parser = None

    with tmpdir.join('input.json').open('w') as fp:
        json.dump({
            'mykey': [inject_root_id(root_id, input_row) for input_row in input_list]
        }, fp)
    parser = JSONParser(
        json_filename=tmpdir.join('input.json').strpath,
        root_list_path='mykey',
        schema_parser=schema_parser,
        **extra_kwargs)
    parser.parse()

    expected_output_list = [
        inject_root_id(root_id, expected_output_dict)
        for expected_output_dict in expected_output_list
    ]
    if expected_output_list == [{}]:
        # An empty dict would mean an empty row; we expect no rows at all.
        expected_output_list = []
    assert list(parser.main_sheet.lines) == expected_output_list
def test_sub_sheet_names(self, tmpdir):
    """A $ref'd object array gets its sub-sheet named after the referenced
    definition ('testB'), with schema columns present even when absent from
    the data."""
    schema_file = tmpdir.join('test.json')
    schema_file.write('''{ "properties": { "c": { "type": "array", "items": {"$ref": "#/testB"} } }, "testB": { "type": "object", "properties": { "d": { "type": "string" }, "f": { "type": "string" } } } }''')
    schema_parser = SchemaParser(schema_filename=schema_file.strpath)
    schema_parser.parse()

    json_rows = [OrderedDict([
        ('a', 'b'),
        ('c', [OrderedDict([('d', 'e')])]),
    ])]
    parser = JSONParser(root_json_dict=json_rows, schema_parser=schema_parser)
    parser.parse()

    assert list(parser.main_sheet) == ['a']
    assert parser.main_sheet.lines == [{'a': 'b'}]
    assert len(parser.sub_sheets) == 1
    # Column 'f' comes from the schema even though the data never sets it.
    assert list(parser.sub_sheets['testB']) == list(['ocid', 'd', 'f'])
    assert parser.sub_sheets['testB'].lines == [{'d': 'e'}]
def test_parse_empty_json_dict():
    """An empty root dict yields an empty main sheet and no sub-sheets."""
    parser = JSONParser(root_json_dict={})
    parser.parse()

    assert list(parser.main_sheet) == []
    assert parser.main_sheet.lines == []
    assert parser.sub_sheets == {}